Home » lucene-3.0.1 » org.apache » lucene » demo » [javadoc | source]

    1   package org.apache.lucene.demo;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
   20   import org.apache.lucene.analysis.standard.StandardAnalyzer;
   21   import org.apache.lucene.document.Document;
   22   import org.apache.lucene.index.IndexReader;
   23   import org.apache.lucene.index.IndexWriter;
   24   import org.apache.lucene.index.Term;
   25   import org.apache.lucene.index.TermEnum;
   26   import org.apache.lucene.store.FSDirectory;
   27   import org.apache.lucene.util.Version;
   28   
   29   import java.io.File;
   30   import java.util.Date;
   31   import java.util.Arrays;
   32   
   33   /** Indexer for HTML files. */
   34   public class IndexHTML {
   35     private IndexHTML() {}
   36   
   37     private static boolean deleting = false;	  // true during deletion pass
   38     private static IndexReader reader;		  // existing index
   39     private static IndexWriter writer;		  // new index being built
   40     private static TermEnum uidIter;		  // document id iterator
   41   
   42     /** Indexer for HTML files.*/
   43     public static void main(String[] argv) {
   44       try {
   45         File index = new File("index");
   46         boolean create = false;
   47         File root = null;
   48   
   49         String usage = "IndexHTML [-create] [-index <index>] <root_directory>";
   50   
   51         if (argv.length == 0) {
   52           System.err.println("Usage: " + usage);
   53           return;
   54         }
   55   
   56         for (int i = 0; i < argv.length; i++) {
   57           if (argv[i].equals("-index")) {		  // parse -index option
   58             index = new File(argv[++i]);
   59           } else if (argv[i].equals("-create")) {	  // parse -create option
   60             create = true;
   61           } else if (i != argv.length-1) {
   62             System.err.println("Usage: " + usage);
   63             return;
   64           } else
   65             root = new File(argv[i]);
   66         }
   67         
   68         if(root == null) {
   69           System.err.println("Specify directory to index");
   70           System.err.println("Usage: " + usage);
   71           return;
   72         }
   73   
   74         Date start = new Date();
   75   
   76         if (!create) {				  // delete stale docs
   77           deleting = true;
   78           indexDocs(root, index, create);
   79         }
   80         writer = new IndexWriter(FSDirectory.open(index), new StandardAnalyzer(Version.LUCENE_CURRENT), create, 
   81                                  new IndexWriter.MaxFieldLength(1000000));
   82         indexDocs(root, index, create);		  // add new docs
   83   
   84         System.out.println("Optimizing index...");
   85         writer.optimize();
   86         writer.close();
   87   
   88         Date end = new Date();
   89   
   90         System.out.print(end.getTime() - start.getTime());
   91         System.out.println(" total milliseconds");
   92   
   93       } catch (Exception e) {
   94         e.printStackTrace();
   95       }
   96     }
   97   
   98     /* Walk directory hierarchy in uid order, while keeping uid iterator from
   99     /* existing index in sync.  Mismatches indicate one of: (a) old documents to
  100     /* be deleted; (b) unchanged documents, to be left alone; or (c) new
  101     /* documents, to be indexed.
  102      */
  103   
  104     private static void indexDocs(File file, File index, boolean create)
  105          throws Exception {
  106       if (!create) {				  // incrementally update
  107   
  108         reader = IndexReader.open(FSDirectory.open(index), false);		  // open existing index
  109         uidIter = reader.terms(new Term("uid", "")); // init uid iterator
  110   
  111         indexDocs(file);
  112   
  113         if (deleting) {				  // delete rest of stale docs
  114           while (uidIter.term() != null && uidIter.term().field() == "uid") {
  115             System.out.println("deleting " +
  116                 HTMLDocument.uid2url(uidIter.term().text()));
  117             reader.deleteDocuments(uidIter.term());
  118             uidIter.next();
  119           }
  120           deleting = false;
  121         }
  122   
  123         uidIter.close();				  // close uid iterator
  124         reader.close();				  // close existing index
  125   
  126       } else					  // don't have exisiting
  127         indexDocs(file);
  128     }
  129   
  130     private static void indexDocs(File file) throws Exception {
  131       if (file.isDirectory()) {			  // if a directory
  132         String[] files = file.list();		  // list its files
  133         Arrays.sort(files);			  // sort the files
  134         for (int i = 0; i < files.length; i++)	  // recursively index them
  135           indexDocs(new File(file, files[i]));
  136   
  137       } else if (file.getPath().endsWith(".html") || // index .html files
  138         file.getPath().endsWith(".htm") || // index .htm files
  139         file.getPath().endsWith(".txt")) { // index .txt files
  140   
  141         if (uidIter != null) {
  142           String uid = HTMLDocument.uid(file);	  // construct uid for doc
  143   
  144           while (uidIter.term() != null && uidIter.term().field() == "uid" &&
  145               uidIter.term().text().compareTo(uid) < 0) {
  146             if (deleting) {			  // delete stale docs
  147               System.out.println("deleting " +
  148                   HTMLDocument.uid2url(uidIter.term().text()));
  149               reader.deleteDocuments(uidIter.term());
  150             }
  151             uidIter.next();
  152           }
  153           if (uidIter.term() != null && uidIter.term().field() == "uid" &&
  154               uidIter.term().text().compareTo(uid) == 0) {
  155             uidIter.next();			  // keep matching docs
  156           } else if (!deleting) {			  // add new docs
  157             Document doc = HTMLDocument.Document(file);
  158             System.out.println("adding " + doc.get("path"));
  159             writer.addDocument(doc);
  160           }
  161         } else {					  // creating a new index
  162           Document doc = HTMLDocument.Document(file);
  163           System.out.println("adding " + doc.get("path"));
  164           writer.addDocument(doc);		  // add docs unconditionally
  165         }
  166       }
  167     }
  168   }

Home » lucene-3.0.1 » org.apache » lucene » demo » [javadoc | source]