本文共 7598 字,大约阅读时间需要 25 分钟。
3. 用 Apache 的 POI 来抽取 Excel。
import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;
import org.apache.poi.hwpf.extractor.WordExtractor;import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFCell;
import java.io.File;import java.io.InputStream;import java.io.FileInputStream;
import com.search.code.Index;
/**
 * Extracts the text of every cell of an Excel (.xls) workbook and passes it to the index.
 *
 * <p>All sheets, rows and cells are visited in order. The text of each non-empty cell is
 * appended to one buffer, separated by single spaces so that values from adjacent cells
 * do not merge into one token. Nothing is indexed when the workbook yields no text.
 *
 * @param index index that receives the extracted text through {@code AddIndex}
 * @param url   passed through unchanged to {@code AddIndex}
 * @param title passed through unchanged to {@code AddIndex}
 * @param is    stream the workbook is read from; it is not closed by this method
 * @return always {@code null}
 * @throws DocCenterException when a {@code DocCenterException} is raised inside the
 *         extraction/indexing block; the original exception is kept as the cause
 */
public Document getDocument(Index index, String url, String title, InputStream is) throws DocCenterException {
    StringBuffer content = new StringBuffer();
    try {
        // Open the Excel workbook from the supplied stream.
        HSSFWorkbook workbook = new HSSFWorkbook(is);
        for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
            HSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            if (sheet == null) {
                continue;
            }
            for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                HSSFRow row = sheet.getRow(rowIndex);
                if (row == null) {
                    continue;
                }
                // '<=' together with the null check below is safe whether getLastCellNum()
                // reports the last index or the last index plus one.
                // NOTE(review): POI documents it as "plus one" - confirm for the version in use.
                short lastCell = row.getLastCellNum();
                for (short cellIndex = 0; cellIndex <= lastCell; cellIndex++) {
                    HSSFCell cell = row.getCell(cellIndex);
                    if (cell != null) {
                        appendCellText(content, cell);
                    }
                }
            }
        }
        // StringBuffer.equals("") is never true, so test the length instead.
        if (content.length() > 0) {
            index.AddIndex(url, title, content.toString());
        }
    } catch (DocCenterException e) {
        throw new DocCenterException("无法从该Microsoft Excel文档中提取内容", e);
    } catch (Exception e) {
        // Best-effort behaviour of the original is kept: other failures are only reported.
        System.out.println("已运行xlRead() : " + e);
    }
    return null;
}

/**
 * Appends the textual form of one cell to the buffer, preceded by a space when the
 * buffer already holds text. Cells that yield no text are skipped.
 */
private void appendCellText(StringBuffer content, HSSFCell cell) {
    String text;
    switch (cell.getCellType()) {
        case HSSFCell.CELL_TYPE_NUMERIC:
            // getStringCellValue() rejects numeric cells with an exception, which used to
            // abort extraction of the whole workbook.
            text = String.valueOf(cell.getNumericCellValue());
            break;
        case HSSFCell.CELL_TYPE_BOOLEAN:
            text = String.valueOf(cell.getBooleanCellValue());
            break;
        case HSSFCell.CELL_TYPE_ERROR:
            return;
        default:
            // String, blank and formula cells keep the original call.
            // NOTE(review): formula-cell behaviour of getStringCellValue() differs between
            // POI versions - confirm against the version in use.
            text = cell.getStringCellValue();
            break;
    }
    if (text == null || text.length() == 0) {
        return;
    }
    if (content.length() > 0) {
        content.append(' ');
    }
    content.append(text);
}
4. 用 Apache 的 POI 来抽取 PowerPoint。
import java.io.InputStream;
import org.apache.lucene.document.Document;import org.apache.poi.hslf.HSLFSlideShow;import org.apache.poi.hslf.model.TextRun;import org.apache.poi.hslf.model.Slide;import org.apache.poi.hslf.usermodel.SlideShow;
public Document getDocument(Index index, String url, String title, InputStream is)throws DocCenterException { StringBuffer content = new StringBuffer(""); try{ SlideShow ss = new SlideShow(new HSLFSlideShow(is));//is 为文件的InputStream,建立SlideShow Slide[] slides = ss.getSlides();//获得每一张幻灯片 for(int i = 0;i < slides.length;i++){
TextRun[] t = slides[i].getTextRuns();//为了取得幻灯片的文字内容,建立TextRun for(int j = 0;j < t.length;j++){
content.append(t[j].getText());//这里会将文字内容加到content中去 } content.append(slides[i].getTitle()); } index.AddIndex(url, title, content.toString()); }catch(Exception ex){ System.out.println(ex.toString()); } return null;}
5. PDFBox——用来抽取 PDF 文件
但是pdfbox对中文支持还不好,先下载pdfbox:http://www.matrix.org.cn/down_view.asp?id=12
下面是一个如何使用pdfbox抽取pdf文件的例子:
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdfparser.PDFParser;
import java.io.*;
import org.pdfbox.util.PDFTextStripper;
import java.util.Date;

/**
 * <p>Title: pdf extraction</p>
 * <p>Description: email:chris@matrix.org.cn</p>
 * <p>Copyright: Matrix Copyright (c) 2003</p>
 * <p>Company: Matrix.org.cn</p>
 *
 * Extracts the plain text of a PDF file with PDFBox.
 *
 * @author chris
 * @version 1.0,who use this example pls remain the declare
 */
public class PdfExtracter {

    public PdfExtracter() {
    }

    /**
     * Parses the given PDF file and returns its text content.
     *
     * @param filename path of the PDF file to read
     * @return the text written by {@code PDFTextStripper} for the whole document
     * @throws Exception if the file cannot be opened, parsed or stripped
     */
    public String GetTextFromPdf(String filename) throws Exception {
        FileInputStream is = new FileInputStream(filename);
        try {
            PDDocument pdfdocument = null;
            try {
                PDFParser parser = new PDFParser(is);
                parser.parse();
                pdfdocument = parser.getPDDocument();

                ByteArrayOutputStream out = new ByteArrayOutputStream();
                // Encode and decode with the same explicit charset: the platform default
                // may be unable to represent the extracted characters (e.g. Chinese text).
                OutputStreamWriter writer = new OutputStreamWriter(out, "UTF-8");
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.writeText(pdfdocument.getDocument(), writer);
                writer.close();

                byte[] contents = out.toByteArray();
                String ts = new String(contents, "UTF-8");
                System.out.println("the string length is" + contents.length + "\n");
                return ts;
            } finally {
                // Release the parsed document even when stripping fails.
                if (pdfdocument != null) {
                    pdfdocument.close();
                }
            }
        } finally {
            is.close();
        }
    }

    /** Demo entry point: prints the text of {@code c://a.pdf}. */
    public static void main(String args[]) {
        PdfExtracter pf = new PdfExtracter();
        try {
            String ts = pf.GetTextFromPdf("c://a.pdf");
            System.out.println(ts);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
6.抽取支持中文的pdf文件-xpdf
xpdf是一个开源项目,我们可以调用他的本地方法来实现抽取中文pdf文件。
下载xpdf函数包:http://www.matrix.org.cn/down_view.asp?id=15
同时需要下载支持中文的补丁包:http://www.matrix.org.cn/down_view.asp?id=16
按照readme放好中文的patch,就可以开始写调用本地方法的java程序了
下面是一个如何调用的例子:
import java.io.*;

/**
 * <p>Title: pdf extraction</p>
 * <p>Description: email:chris@matrix.org.cn</p>
 * <p>Copyright: Matrix Copyright (c) 2003</p>
 * <p>Company: Matrix.org.cn</p>
 *
 * Extracts the text of a PDF file by running the external {@code pdftotext} program
 * and reading its standard output.
 *
 * @author chris
 * @version 1.0,who use this example pls remain the declare
 */
public class PdfWin {

    public PdfWin() {
    }

    /**
     * Runs {@code pdftotext} on {@code c://a.pdf} with UTF-8 output sent to stdout
     * and prints the complete extracted text.
     *
     * @throws Exception if the process cannot be started or its output cannot be read
     */
    public static void main(String args[]) throws Exception {
        String pathToXpdf = "C://Program Files//xpdf//pdftotext.exe";
        String filename = "c://a.pdf";
        // The trailing "-" passed to pdftotext sends the text to standard output.
        String[] cmd = new String[] { pathToXpdf, "-enc", "UTF-8", "-q", filename, "-" };
        Process p = Runtime.getRuntime().exec(cmd);

        StringWriter out = new StringWriter();
        InputStreamReader reader =
                new InputStreamReader(new BufferedInputStream(p.getInputStream()), "UTF-8");
        try {
            char[] buf = new char[10000];
            int len;
            while ((len = reader.read(buf)) >= 0) {
                // Accumulate every chunk; building the result from buf alone would keep
                // only the last chunk plus stale characters from earlier reads.
                out.write(buf, 0, len);
                System.out.println("the length is" + len);
            }
        } finally {
            reader.close();
        }
        // Output is fully drained at this point, so waiting cannot block on a full pipe.
        p.waitFor();

        String ts = out.toString();
        System.out.println("the str is" + ts);
    }
}
代码复制可能出错,不过代码经过测试,绝对能用,POI为3.0-rc4,PDFBOX为0.7.3
POI:
PDFBOX:
转载地址:https://blog.csdn.net/thamsyangsw/article/details/4209037 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!