1 package org.apache.lucene.demo; 2 3 /** 4 * Licensed to the Apache Software Foundation (ASF) under one or more 5 * contributor license agreements. See the NOTICE file distributed with 6 * this work for additional information regarding copyright ownership. 7 * The ASF licenses this file to You under the Apache License, Version 2.0 8 * (the "License"); you may not use this file except in compliance with 9 * the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, software 14 * distributed under the License is distributed on an "AS IS" BASIS, 15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 * See the License for the specific language governing permissions and 17 * limitations under the License. 18 */ 19 20 import java.io; 21 import org.apache.lucene.document; 22 import org.apache.lucene.demo.html.HTMLParser; 23 24 /** A utility for making Lucene Documents for HTML documents. */ 25 26 public class HTMLDocument { 27 static char dirSep = System.getProperty("file.separator").charAt(0); 28 29 public static String uid(File f) { 30 // Append path and date into a string in such a way that lexicographic 31 // sorting gives the same results as a walk of the file hierarchy. Thus 32 // null (\u0000) is used both to separate directory components and to 33 // separate the path from the date. 34 return f.getPath().replace(dirSep, '\u0000') + 35 "\u0000" + 36 DateTools.timeToString(f.lastModified(), DateTools.Resolution.SECOND); 37 } 38 39 public static String uid2url(String uid) { 40 String url = uid.replace('\u0000', '/'); // replace nulls with slashes 41 return url.substring(0, url.lastIndexOf('/')); // remove date from end 42 } 43 44 public static Document Document(File f) 45 throws IOException, InterruptedException { 46 // make a new, empty document 47 Document doc = new Document(); 48 49 // Add the url as a field named "path". Use a field that is 50 // indexed (i.e. searchable), but don't tokenize the field into words. 51 doc.add(new Field("path", f.getPath().replace(dirSep, '/'), Field.Store.YES, 52 Field.Index.NOT_ANALYZED)); 53 54 // Add the last modified date of the file a field named "modified". 55 // Use a field that is indexed (i.e. searchable), but don't tokenize 56 // the field into words. 57 doc.add(new Field("modified", 58 DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE), 59 Field.Store.YES, Field.Index.NOT_ANALYZED)); 60 61 // Add the uid as a field, so that index can be incrementally maintained. 62 // This field is not stored with document, it is indexed, but it is not 63 // tokenized prior to indexing. 64 doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.NOT_ANALYZED)); 65 66 FileInputStream fis = new FileInputStream(f); 67 HTMLParser parser = new HTMLParser(fis); 68 69 // Add the tag-stripped contents as a Reader-valued Text field so it will 70 // get tokenized and indexed. 71 doc.add(new Field("contents", parser.getReader())); 72 73 // Add the summary as a field that is stored and returned with 74 // hit documents for display. 75 doc.add(new Field("summary", parser.getSummary(), Field.Store.YES, Field.Index.NO)); 76 77 // Add the title as a field that it can be searched and that is stored. 78 doc.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.ANALYZED)); 79 80 // return the document 81 return doc; 82 } 83 84 private HTMLDocument() {} 85 } 86