Home » lucene-3.0.1 » org.apache » lucene » demo » [javadoc | source]

    1   package org.apache.lucene.demo;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
import java.io.*;
import org.apache.lucene.document.*;
import org.apache.lucene.demo.html.HTMLParser;
   23   
   24   /** A utility for making Lucene Documents for HTML documents. */
   25   
   26   public class HTMLDocument {
   27     static char dirSep = System.getProperty("file.separator").charAt(0);
   28   
   29     public static String uid(File f) {
   30       // Append path and date into a string in such a way that lexicographic
   31       // sorting gives the same results as a walk of the file hierarchy.  Thus
   32       // null (\u0000) is used both to separate directory components and to
   33       // separate the path from the date.
   34       return f.getPath().replace(dirSep, '\u0000') +
   35         "\u0000" +
   36         DateTools.timeToString(f.lastModified(), DateTools.Resolution.SECOND);
   37     }
   38   
   39     public static String uid2url(String uid) {
   40       String url = uid.replace('\u0000', '/');	  // replace nulls with slashes
   41       return url.substring(0, url.lastIndexOf('/')); // remove date from end
   42     }
   43   
   44     public static Document Document(File f)
   45          throws IOException, InterruptedException  {
   46       // make a new, empty document
   47       Document doc = new Document();
   48   
   49       // Add the url as a field named "path".  Use a field that is 
   50       // indexed (i.e. searchable), but don't tokenize the field into words.
   51       doc.add(new Field("path", f.getPath().replace(dirSep, '/'), Field.Store.YES,
   52           Field.Index.NOT_ANALYZED));
   53   
   54       // Add the last modified date of the file a field named "modified".  
   55       // Use a field that is indexed (i.e. searchable), but don't tokenize
   56       // the field into words.
   57       doc.add(new Field("modified",
   58           DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
   59           Field.Store.YES, Field.Index.NOT_ANALYZED));
   60   
   61       // Add the uid as a field, so that index can be incrementally maintained.
   62       // This field is not stored with document, it is indexed, but it is not
   63       // tokenized prior to indexing.
   64       doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.NOT_ANALYZED));
   65   
   66       FileInputStream fis = new FileInputStream(f);
   67       HTMLParser parser = new HTMLParser(fis);
   68         
   69       // Add the tag-stripped contents as a Reader-valued Text field so it will
   70       // get tokenized and indexed.
   71       doc.add(new Field("contents", parser.getReader()));
   72   
   73       // Add the summary as a field that is stored and returned with
   74       // hit documents for display.
   75       doc.add(new Field("summary", parser.getSummary(), Field.Store.YES, Field.Index.NO));
   76   
   77       // Add the title as a field that it can be searched and that is stored.
   78       doc.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
   79   
   80       // return the document
   81       return doc;
   82     }
   83   
   84     private HTMLDocument() {}
   85   }
   86       

Home » lucene-3.0.1 » org.apache » lucene » demo » [javadoc | source]