Use Apache Tika and Solr to index and search documents
//Use Apache Tika and Solr to crawl, index and search documents
//John Miedema http://johnmiedema.com
//-----------------------------------------------------------
//referenced libraries:
//Apache Tika 1.5
//Apache Solr 4.7.2
//Apache HttpClient 4.3.3, required to connect to the Solr server
//Noggit JSON parser, required for Solr commands
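//
//dependency note (an addition, not from the original gist): with Maven, coordinates roughly like
//  <dependency><groupId>org.apache.tika</groupId><artifactId>tika-parsers</artifactId><version>1.5</version></dependency>
//  <dependency><groupId>org.apache.solr</groupId><artifactId>solr-solrj</artifactId><version>4.7.2</version></dependency>
//should cover the list above, since solr-solrj pulls in HttpClient and Noggit transitively;
//treat these coordinates as an assumption and verify them on Maven Central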
//-----------------------------------------------------------
//after Solr is downloaded, start it using the following commands
//cd path\solr-4.7.2\example
//java -jar start.jar
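//once started, loading the admin UI at http://localhost:8983/solr in a browser is a quick way
//to confirm the server is up before running this program (assumes the default port 8983)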
//-----------------------------------------------------------
package whatson2;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.UUID;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

public class Main {

    private static SolrServer solr;

    public static void main(String[] args) throws IOException, SAXException, TikaException {
        try {
            solr = new HttpSolrServer("http://localhost:8983/solr/"); //create the Solr connection
            solr.deleteByQuery("*:*"); //delete everything in the index; useful for testing

            //location of source documents
            //later this will be switched to a database
            String path = "C:\\content\\";
            String file_html = path + "mobydick.htm";
            String file_txt = path + "robinsoncrusoe.txt";
            String file_pdf = path + "callofthewild.pdf";

            processDocument(file_html);
            processDocument(file_txt);
            processDocument(file_pdf);

            solr.commit(); //after all docs are added, commit to the index
            //now you can search at http://localhost:8983/solr/browse
            //(or query programmatically; see the searchDocuments sketch after this method)
        }
        catch (Exception ex) {
            System.out.println(ex.getMessage());
        }
        finally {
            if (solr != null) solr.shutdown(); //release the underlying HTTP client resources
        }
    }
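
    //-----------------------------------------------------------
    //Addition (not part of the original gist): a minimal sketch of querying the index with
    //SolrJ, assuming the default /select handler whose default search field ("df") is "text"
    //in the example solrconfig.xml. The method name and the example call are hypothetical;
    //the returned fields follow the mapping used in indexDocument below. Fully qualified
    //class names are used here so the original import list stays unchanged.
    //Usage idea: call searchDocuments("whale"); in main after solr.commit().
    private static void searchDocuments(String queryText) {
        try {
            org.apache.solr.client.solrj.SolrQuery query = new org.apache.solr.client.solrj.SolrQuery();
            query.setQuery(queryText);                        //the search terms, e.g. "whale"
            query.setFields("id", "title", "author", "url");  //fields to return for each hit
            query.setRows(10);                                //cap the number of hits returned
            org.apache.solr.client.solrj.response.QueryResponse response = solr.query(query);
            for (org.apache.solr.common.SolrDocument d : response.getResults()) {
                System.out.println(d.getFieldValue("title") + " - " + d.getFieldValue("url"));
            }
        }
        catch (Exception ex) {
            System.out.println(ex.getMessage());
        }
    }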
    private static void processDocument(String pathfilename) {
        try {
            InputStream input = new FileInputStream(new File(pathfilename));

            //use Apache Tika to convert documents in different formats to plain text
            ContentHandler textHandler = new BodyContentHandler(10*1024*1024); //raise the write limit so large documents are not truncated
            Metadata meta = new Metadata();
            Parser parser = new AutoDetectParser(); //detects the format and picks the right parser
            ParseContext context = new ParseContext();
            parser.parse(input, textHandler, meta, context); //convert to plain text
            input.close(); //close the source stream once parsing is done

            //collect metadata and content from Tika and other sources
            //document id must be unique; use a guid
            UUID guid = UUID.randomUUID();
            String docid = guid.toString();

            //Dublin Core metadata (partial set)
            String doctitle = meta.get(DublinCore.TITLE);
            String doccreator = meta.get(DublinCore.CREATOR);

            //other metadata
            String docurl = pathfilename; //document url

            //content
            String doccontent = textHandler.toString();

            //call to index
            indexDocument(docid, doctitle, doccreator, docurl, doccontent);
        }
        catch (Exception ex) {
            System.out.println(ex.getMessage());
        }
    }

    private static void indexDocument(String docid, String doctitle, String doccreator, String docurl, String doccontent) {
        try {
            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("id", docid);

            //map metadata fields to the default schema
            //location: path\solr-4.7.2\example\solr\collection1\conf\schema.xml
            //Dublin Core
            //thought: the schema could be modified to use Dublin Core field names
            doc.addField("title", doctitle);
            doc.addField("author", doccreator);

            //other metadata
            doc.addField("url", docurl);

            //content (and text)
            //per the schema, the content field is not indexed by default; it is used for returning and highlighting document content
            //the schema "copyField" command automatically copies it to the "text" field, which is indexed
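            //for reference, the directive in the example schema looks roughly like
            //  <copyField source="content" dest="text"/>
            //(paraphrased from memory of the stock schema, so verify it in your copy)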
doc.addField("content", doccontent); | |
//indexing | |
//when a field is indexed, like "text", Solr will handle tokenization, stemming, removal of stopwords etc, per the schema defn | |
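            //for example, the stock "text_general" field type (used by the "text" field in the example schema)
            //typically chains a standard tokenizer, a stopword filter and a lowercase filter; this is an
            //assumption about the default schema, so check the fieldType definition in your own schema.xml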
            //add to the index
            solr.add(doc);
        }
        catch (Exception ex) {
            System.out.println(ex.getMessage());
        }
    }
}
Hi johnmiedema,
Is it possible to run this program in a Windows environment?