package org.apache.lucene.demo;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Arrays;

/**
 * Command-line indexer for HTML files.
 *
 * <p>Walks a directory tree and indexes every <code>.html</code>,
 * <code>.htm</code> and <code>.txt</code> file it finds. Unless
 * <code>-create</code> is given, the existing index is updated incrementally:
 * a first pass deletes documents whose uid no longer matches a file on disk,
 * and a second pass adds the files that are not in the index yet.
 *
 * <p>The class keeps its working state in static fields and is meant to be
 * driven from {@link #main(String[])} only.
 */
public class IndexHTML {
  private IndexHTML() {}

  /** Command-line synopsis shown whenever the arguments cannot be parsed. */
  private static final String USAGE =
      "IndexHTML [-create] [-index <index>] <root_directory>";

  /** Name of the index field holding each document's unique id. */
  private static final String UID_FIELD = "uid";

  private static boolean deleting = false;  // true during deletion pass
  private static IndexReader reader;        // existing index
  private static IndexWriter writer;        // new index being built
  private static TermEnum uidIter;          // document id iterator

  /**
   * Entry point: parses the command line and builds or updates the index.
   *
   * <p>Recognized arguments: <code>-create</code> (build a new index instead
   * of updating one), <code>-index &lt;index&gt;</code> (index directory,
   * default <code>index</code>) and, as the last argument, the root directory
   * to index. Any failure is reported by printing its stack trace.
   *
   * @param argv command-line arguments
   */
  public static void main(String[] argv) {
    try {
      File index = new File("index");
      boolean create = false;
      File root = null;

      if (argv.length == 0) {
        System.err.println("Usage: " + USAGE);
        return;
      }

      for (int i = 0; i < argv.length; i++) {
        if (argv[i].equals("-index")) {           // parse -index option
          if (i + 1 >= argv.length) {             // option is missing its value
            System.err.println("Usage: " + USAGE);
            return;
          }
          index = new File(argv[++i]);
        } else if (argv[i].equals("-create")) {   // parse -create option
          create = true;
        } else if (i != argv.length - 1) {        // only the last arg may be the root
          System.err.println("Usage: " + USAGE);
          return;
        } else {
          root = new File(argv[i]);
        }
      }

      if (root == null) {
        System.err.println("Specify directory to index");
        System.err.println("Usage: " + USAGE);
        return;
      }

      Date start = new Date();

      if (!create) {                              // delete stale docs
        deleting = true;
        indexDocs(root, index, create);
      }

      writer = new IndexWriter(FSDirectory.open(index),
                               new StandardAnalyzer(Version.LUCENE_CURRENT), create,
                               new IndexWriter.MaxFieldLength(1000000));
      boolean closed = false;
      try {
        indexDocs(root, index, create);           // add new docs

        System.out.println("Optimizing index...");
        writer.optimize();
        writer.close();
        closed = true;
      } finally {
        if (!closed) {
          // Do not leave the writer (and its write lock) open after a
          // failure; rolling back keeps the failed run from committing
          // a partially built index.
          rollbackWriter();
        }
      }

      Date end = new Date();

      System.out.print(end.getTime() - start.getTime());
      System.out.println(" total milliseconds");

    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Walks the directory hierarchy in uid order while keeping the uid iterator
   * of the existing index in sync. Mismatches indicate one of: (a) old
   * documents to be deleted; (b) unchanged documents, to be left alone; or
   * (c) new documents, to be indexed.
   *
   * <p>When <code>create</code> is true there is no existing index to compare
   * against and the tree is simply walked.
   *
   * @param file   root file or directory to walk
   * @param index  directory of the index
   * @param create true when a brand-new index is being built
   * @throws Exception if reading the index or a file fails
   */
  private static void indexDocs(File file, File index, boolean create)
       throws Exception {
    if (create) {                                 // don't have existing index
      indexDocs(file);
      return;
    }

    // incrementally update
    reader = IndexReader.open(FSDirectory.open(index), false); // open existing index
    try {
      uidIter = reader.terms(new Term(UID_FIELD, ""));          // init uid iterator
      try {
        indexDocs(file);

        if (deleting) {                           // delete rest of stale docs
          while (atUidTerm()) {
            deleteCurrentDoc();
            uidIter.next();
          }
        }
      } finally {
        uidIter.close();                          // close uid iterator
      }
    } finally {
      deleting = false;                           // deletion pass is over either way
      // Close existing index. NOTE(review): per the Lucene docs, closing a
      // non-read-only reader also saves pending deletions, so stale docs
      // removed before a failure stay removed - confirm this is acceptable.
      reader.close();
    }
  }

  /**
   * Recursively processes one file or directory.
   *
   * <p>Directory entries are visited in sorted name order. Regular files are
   * handled only if their path ends in <code>.html</code>, <code>.htm</code>
   * or <code>.txt</code>. With a uid iterator present, the file's uid is
   * compared against the iterator position to decide whether index entries
   * are stale, already present, or missing; without one, the file is added
   * unconditionally.
   *
   * @param file file or directory to process
   * @throws IOException if a directory cannot be listed
   * @throws Exception if reading the file or updating the index fails
   */
  private static void indexDocs(File file) throws Exception {
    if (file.isDirectory()) {                     // if a directory
      String[] files = file.list();               // list its files
      if (files == null) {
        // File.list() returns null on an I/O error. Fail with a clear message
        // instead of skipping: silently skipping would make the deletion pass
        // treat every indexed file below this directory as stale.
        throw new IOException("Cannot list directory: " + file.getPath());
      }
      Arrays.sort(files);                         // sort the files
      for (int i = 0; i < files.length; i++) {    // recursively index them
        indexDocs(new File(file, files[i]));
      }
      return;
    }

    if (!isIndexable(file)) {
      return;
    }

    if (uidIter == null) {                        // creating a new index
      addFile(file);                              // add docs unconditionally
      return;
    }

    String uid = HTMLDocument.uid(file);          // construct uid for doc

    // Everything in the index that sorts before this file's uid has no
    // matching file in the walk: it is stale.
    while (atUidTerm() && uidIter.term().text().compareTo(uid) < 0) {
      if (deleting) {                             // delete stale docs
        deleteCurrentDoc();
      }
      uidIter.next();
    }

    if (atUidTerm() && uidIter.term().text().compareTo(uid) == 0) {
      uidIter.next();                             // keep matching docs
    } else if (!deleting) {                       // add new docs
      addFile(file);
    }
  }

  /** Returns true if the file's path has one of the indexed extensions. */
  private static boolean isIndexable(File file) {
    String path = file.getPath();
    return path.endsWith(".html")                 // index .html files
        || path.endsWith(".htm")                  // index .htm files
        || path.endsWith(".txt");                 // index .txt files
  }

  /** Returns true while the uid iterator is positioned on a term of the uid field. */
  private static boolean atUidTerm() {
    Term term = uidIter.term();
    // equals() rather than ==: do not depend on field names being interned.
    return term != null && UID_FIELD.equals(term.field());
  }

  /** Reports and deletes the documents for the term the uid iterator is positioned on. */
  private static void deleteCurrentDoc() throws Exception {
    Term term = uidIter.term();
    System.out.println("deleting " + HTMLDocument.uid2url(term.text()));
    reader.deleteDocuments(term);
  }

  /** Builds a document for the file, reports it, and adds it to the writer. */
  private static void addFile(File file) throws Exception {
    Document doc = HTMLDocument.Document(file);
    System.out.println("adding " + doc.get("path"));
    writer.addDocument(doc);
  }

  /**
   * Rolls the writer back after a failed run. A failure of the rollback itself
   * is only reported, so that the exception that caused it is the one that
   * propagates to the caller.
   */
  private static void rollbackWriter() {
    try {
      writer.rollback();
    } catch (IOException e) {
      System.err.println("Could not roll back index writer: " + e);
    }
  }
}