| Method Detail from org.apache.nutch.crawl.LinkDb: |
/**
 * Lifecycle hook with an empty body: calling it has no effect.
 *
 * NOTE(review): presumably present to satisfy an interface implemented by the
 * enclosing class (not visible here) - confirm before removing.
 */
public void close() {
}
|
/**
 * Initialises this instance from the supplied job configuration.
 *
 * <ul>
 *   <li>{@code db.max.anchor.length} (default 100) - maximum number of
 *       characters kept from an anchor text.</li>
 *   <li>{@code db.ignore.internal.links} (default true) - whether links whose
 *       source and target hosts match are dropped.</li>
 *   <li>{@code LinkDbFilter.URL_FILTERING} / {@code LinkDbFilter.URL_NORMALIZING}
 *       (both default false) - when set, the URL filters / normalizers are
 *       created; otherwise the corresponding field is left untouched.</li>
 * </ul>
 *
 * @param job the job configuration to read from
 */
public void configure(JobConf job) {
  maxAnchorLength = job.getInt("db.max.anchor.length", 100);
  ignoreInternalLinks = job.getBoolean("db.ignore.internal.links", true);

  final boolean filteringRequested = job.getBoolean(LinkDbFilter.URL_FILTERING, false);
  if (filteringRequested) {
    urlFilters = new URLFilters(job);
  }

  final boolean normalizingRequested = job.getBoolean(LinkDbFilter.URL_NORMALIZING, false);
  if (normalizingRequested) {
    urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
  }
}
|
/**
 * Moves the output of {@code job} into place as the current LinkDb under
 * {@code linkDb} and removes the LinkDb lock file.
 *
 * <p>Sequence: an existing current db is first renamed to {@code old}
 * (replacing any stale {@code old}), the job output is renamed to the current
 * location, and only then is {@code old} deleted.
 *
 * <p>{@code FileSystem.rename} reports failure through its boolean result
 * rather than an exception. Both renames are therefore checked: if either
 * fails the previous db is kept (restored to the current location where it had
 * already been moved aside), the lock file is removed, and an
 * {@link IOException} is thrown instead of silently deleting the backup.
 *
 * @param job    the job whose output path holds the new LinkDb data
 * @param linkDb the LinkDb root directory
 * @throws IOException if a file system operation fails or a rename is refused
 */
public static void install(JobConf job,
    Path linkDb) throws IOException {
  Path newLinkDb = FileOutputFormat.getOutputPath(job);
  FileSystem fs = new JobClient(job).getFs();
  Path old = new Path(linkDb, "old");
  Path current = new Path(linkDb, CURRENT_NAME);
  Path lock = new Path(linkDb, LOCK_NAME);

  boolean movedAside = false;
  if (fs.exists(current)) {
    if (fs.exists(old)) fs.delete(old, true);
    if (!fs.rename(current, old)) {
      // Continuing would rename the new db onto a still-existing directory.
      LockUtil.removeLockFile(fs, lock);
      throw new IOException("LinkDb: could not move " + current + " to " + old);
    }
    movedAside = true;
  }

  fs.mkdirs(linkDb);
  if (!fs.rename(newLinkDb, current)) {
    // Best effort: put the previous db back so it is not lost or orphaned.
    if (movedAside) fs.rename(old, current);
    LockUtil.removeLockFile(fs, lock);
    throw new IOException("LinkDb: could not move " + newLinkDb + " to " + current);
  }

  // The new db is in place; the backup is no longer needed.
  if (fs.exists(old)) fs.delete(old, true);
  LockUtil.removeLockFile(fs, lock);
}
|
/**
 * Inverts the links of every entry found directly under {@code segmentsDir}.
 *
 * <p>The directory is listed through the filter returned by
 * {@code HadoopFSUtil.getPassDirectoriesFilter}, the result is converted to
 * paths, and the work is delegated to the {@code Path[]} overload.
 *
 * @param linkDb      the LinkDb root directory to create or update
 * @param segmentsDir parent directory whose listed entries are used as segments
 * @param normalize   forwarded unchanged to the {@code Path[]} overload
 * @param filter      forwarded unchanged to the {@code Path[]} overload
 * @param force       forwarded unchanged to the {@code Path[]} overload
 * @throws IOException if listing the directory or the inversion fails
 */
public void invert(Path linkDb,
    Path segmentsDir,
    boolean normalize,
    boolean filter,
    boolean force) throws IOException {
  final FileSystem fileSystem = FileSystem.get(getConf());
  final FileStatus[] listed =
      fileSystem.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fileSystem));
  final Path[] segments = HadoopFSUtil.getPaths(listed);
  invert(linkDb, segments, normalize, filter, force);
}
|
/**
 * Builds inverted links from the given segments and installs the result as the
 * current LinkDb.
 *
 * <p>Steps: create the lock file, run the inversion job over each segment's
 * {@code ParseData.DIR_NAME} subdirectory, and - if a current LinkDb already
 * exists - run a merge job over the existing db and the fresh output, deleting
 * the fresh output afterwards. The output of the last job run is then handed
 * to {@code LinkDb.install}.
 *
 * <p>If either job throws an {@link IOException}, the lock file is removed
 * (and, for the merge job, the fresh output is deleted) before the exception
 * is rethrown.
 *
 * @param linkDb    the LinkDb root directory to create or update
 * @param segments  segment directories to read parse data from
 * @param normalize passed to job creation
 * @param filter    passed to job creation
 * @param force     passed to {@code LockUtil.createLockFile}
 * @throws IOException if locking, a job, or a file system operation fails
 */
public void invert(Path linkDb,
    Path[] segments,
    boolean normalize,
    boolean filter,
    boolean force) throws IOException {
  final Path lockFile = new Path(linkDb, LOCK_NAME);
  final FileSystem fileSystem = FileSystem.get(getConf());
  LockUtil.createLockFile(fileSystem, lockFile, force);
  final Path existingDb = new Path(linkDb, CURRENT_NAME);

  if (LOG.isInfoEnabled()) {
    LOG.info("LinkDb: starting");
    LOG.info("LinkDb: linkdb: " + linkDb);
    LOG.info("LinkDb: URL normalize: " + normalize);
    LOG.info("LinkDb: URL filter: " + filter);
  }

  // Inversion job: one input path per segment.
  final JobConf invertJob = LinkDb.createJob(getConf(), linkDb, normalize, filter);
  for (Path segment : segments) {
    if (LOG.isInfoEnabled()) {
      LOG.info("LinkDb: adding segment: " + segment);
    }
    FileInputFormat.addInputPath(invertJob, new Path(segment, ParseData.DIR_NAME));
  }
  try {
    JobClient.runJob(invertJob);
  } catch (IOException failure) {
    LockUtil.removeLockFile(fileSystem, lockFile);
    throw failure;
  }

  // The job whose output gets installed; replaced by the merge job when merging.
  JobConf jobToInstall = invertJob;

  if (fileSystem.exists(existingDb)) {
    if (LOG.isInfoEnabled()) {
      LOG.info("LinkDb: merging with existing linkdb: " + linkDb);
    }
    // try to merge
    final Path freshOutput = FileOutputFormat.getOutputPath(invertJob);
    final JobConf mergeJob = LinkDbMerger.createMergeJob(getConf(), linkDb, normalize, filter);
    jobToInstall = mergeJob;
    FileInputFormat.addInputPath(mergeJob, existingDb);
    FileInputFormat.addInputPath(mergeJob, freshOutput);
    try {
      JobClient.runJob(mergeJob);
    } catch (IOException failure) {
      LockUtil.removeLockFile(fileSystem, lockFile);
      fileSystem.delete(freshOutput, true);
      throw failure;
    }
    fileSystem.delete(freshOutput, true);
  }

  LinkDb.install(jobToInstall, linkDb);
  if (LOG.isInfoEnabled()) {
    LOG.info("LinkDb: done");
  }
}
|
/**
 * Command-line entry point: runs a new {@code LinkDb} through
 * {@code ToolRunner} with a configuration from
 * {@code NutchConfiguration.create()} and exits the JVM with the returned
 * status code.
 *
 * @param args command-line arguments, passed through to {@code ToolRunner.run}
 * @throws Exception if the tool run throws
 */
public static void main(String[] args) throws Exception {
  System.exit(ToolRunner.run(NutchConfiguration.create(), new LinkDb(), args));
}
|
/**
 * Inverts the outlinks of one parsed page: for every outlink that is kept,
 * emits the target URL as key with an {@code Inlinks} holding a single
 * {@code Inlink} (source URL, anchor) as value.
 *
 * <p>The source URL is normalized and filtered first (when normalizers /
 * filters are configured); if that yields {@code null}, nothing is emitted for
 * the page. Each target URL goes through the same treatment. When internal
 * links are ignored, targets whose host is {@code null} or equals the source
 * host are skipped; hosts are taken from the URLs as they were before
 * normalization. Anchors longer than {@code maxAnchorLength} are truncated.
 *
 * @param key       the source page URL
 * @param parseData parse data providing the page's outlinks
 * @param output    collector receiving (target URL, inlinks) pairs
 * @param reporter  not used by this method
 * @throws IOException if collecting output fails
 */
public void map(Text key,
    ParseData parseData,
    OutputCollector output,
    Reporter reporter) throws IOException {
  final String rawSourceUrl = key.toString();
  final String sourceHost = getHost(rawSourceUrl);

  final String sourceUrl = normalizeAndFilterLinkUrl(rawSourceUrl);
  if (sourceUrl == null) {
    return; // discard all outlinks
  }

  final Outlink[] links = parseData.getOutlinks();
  final Inlinks inverted = new Inlinks(); // reused for every emitted record
  for (Outlink link : links) {
    final String rawTargetUrl = link.getToUrl();

    if (ignoreInternalLinks) {
      final String targetHost = getHost(rawTargetUrl);
      final boolean external = targetHost != null && !targetHost.equals(sourceHost);
      if (!external) {
        continue; // internal link: skip it
      }
    }

    final String targetUrl = normalizeAndFilterLinkUrl(rawTargetUrl);
    if (targetUrl == null) {
      continue;
    }

    inverted.clear();
    String anchorText = link.getAnchor();
    if (anchorText.length() > maxAnchorLength) { // truncate long anchors
      anchorText = anchorText.substring(0, maxAnchorLength);
    }
    inverted.add(new Inlink(sourceUrl, anchorText)); // collect inverted link
    output.collect(new Text(targetUrl), inverted);
  }
}

/**
 * Applies the configured normalizers and then the configured filters to a URL.
 * Returns {@code null} when a step throws (a warning is logged) or when a step
 * itself returns {@code null}; steps that are not configured are skipped.
 */
private String normalizeAndFilterLinkUrl(String url) {
  String result = url;
  if (urlNormalizers != null) {
    try {
      result = urlNormalizers.normalize(result, URLNormalizers.SCOPE_LINKDB);
    } catch (Exception e) {
      LOG.warn("Skipping " + result + ":" + e);
      result = null;
    }
  }
  if (result != null && urlFilters != null) {
    try {
      result = urlFilters.filter(result);
    } catch (Exception e) {
      LOG.warn("Skipping " + result + ":" + e);
      result = null;
    }
  }
  return result;
}
|
/**
 * Parses the command line and runs the link inversion.
 *
 * <p>Expected arguments: the LinkDb path, then either {@code -dir <segmentsDir>}
 * or a list of segment directories, plus the optional flags {@code -force},
 * {@code -noNormalize} and {@code -noFilter}. Flags and segments are accepted
 * in any position after the LinkDb path, including after {@code -dir}, which
 * is the order shown in the usage text.
 *
 * @param args command-line arguments
 * @return 0 on success; -1 if the arguments are invalid or the inversion fails
 * @throws Exception if obtaining the file system or listing {@code -dir} fails
 */
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: LinkDb < linkdb > (-dir < segmentsDir > | < seg1 > < seg2 > ...) [-force] [-noNormalize] [-noFilter]");
    System.err.println("\tlinkdb\toutput LinkDb to create or update");
    System.err.println("\t-dir segmentsDir\tparent directory of several segments, OR");
    System.err.println("\tseg1 seg2 ...\t list of segment directories");
    System.err.println("\t-force\tforce update even if LinkDb appears to be locked (CAUTION advised)");
    System.err.println("\t-noNormalize\tdon't normalize link URLs");
    System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs");
    return -1;
  }
  final FileSystem fs = FileSystem.get(getConf());
  Path db = new Path(args[0]);
  ArrayList<Path> segs = new ArrayList<Path>();
  boolean filter = true;
  boolean normalize = true;
  boolean force = false;
  for (int i = 1; i < args.length; i++) {
    if (args[i].equals("-dir")) {
      if (i + 1 >= args.length) {
        // Without this guard args[++i] would throw outside the try block below.
        System.err.println("LinkDb: -dir requires a segments directory argument");
        return -1;
      }
      Path segDir = new Path(args[++i]);
      FileStatus[] files = fs.listStatus(segDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
      if (files != null) segs.addAll(Arrays.asList(HadoopFSUtil.getPaths(files)));
      // Keep parsing: flags given after -dir must still take effect.
    } else if (args[i].equalsIgnoreCase("-noNormalize")) {
      normalize = false;
    } else if (args[i].equalsIgnoreCase("-noFilter")) {
      filter = false;
    } else if (args[i].equalsIgnoreCase("-force")) {
      force = true;
    } else {
      segs.add(new Path(args[i]));
    }
  }
  try {
    invert(db, segs.toArray(new Path[segs.size()]), normalize, filter, force);
    return 0;
  } catch (Exception e) {
    LOG.fatal("LinkDb: " + StringUtils.stringifyException(e));
    return -1;
  }
}
|