您好,登錄后才能下訂單哦!
這篇文章主要介紹java如何實現查找PDF關鍵字所在頁碼及其坐標,文中介紹的非常詳細,具有一定的參考價值,感興趣的小伙伴們一定要看完!
1、因為最近有這方面的需求,用過之后記錄一下。
2、此功能跟PDF中Ctrl+F性質一樣,如果PDF中為圖片形式的不支持定位到關鍵字。
import com.itextpdf.awt.geom.Rectangle2D.Float; import com.itextpdf.text.pdf.PdfDictionary; import com.itextpdf.text.pdf.PdfName; import com.itextpdf.text.pdf.PdfReader; import com.itextpdf.text.pdf.parser.*; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; /** * 消失的太陽 */ public class MyTest { public static void main(String[] args) throws IOException { //1.給定文件 File pdfFile = new File("D://test.pdf"); //2.定義一個byte數組,長度為文件的長度 byte[] pdfData = new byte[(int) pdfFile.length()]; //3.IO流讀取文件內容到byte數組 FileInputStream inputStream = null; try { inputStream = new FileInputStream(pdfFile); inputStream.read(pdfData); } catch (IOException e) { throw e; } finally { if (inputStream != null) { try { inputStream.close(); } catch (IOException e) { } } } //4.指定關鍵字 String keyword = "消失的太陽:"; //5.調用方法,給定關鍵字和文件 List<float[]> positions = findKeywordPostions(pdfData, keyword); //6.返回值類型是 List<float[]> 每個list元素代表一個匹配的位置,分別為 float[0]所在頁碼 float[1]所在x軸 float[2]所在y軸 System.out.println("total:" + positions.size()); if (positions != null && positions.size() > 0) { for (float[] position : positions) { System.out.print("pageNum: " + (int) position[0]); System.out.print("\tx: " + position[1]); System.out.println("\ty: " + position[2]); } } } /** * findKeywordPostions * @param pdfData 通過IO流 PDF文件轉化的byte數組 * @param keyword 關鍵字 * @return List<float [ ]> : float[0]:pageNum float[1]:x float[2]:y * @throws IOException */ public static List<float[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException { List<float[]> result = new ArrayList<>(); List<PdfPageContentPositions> pdfPageContentPositions = getPdfContentPostionsList(pdfData); for (PdfPageContentPositions pdfPageContentPosition : pdfPageContentPositions) { List<float[]> charPositions = findPositions(keyword, pdfPageContentPosition); if (charPositions == null || charPositions.size() < 1) { continue; } result.addAll(charPositions); } return result; } private static List<PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException { PdfReader reader = new PdfReader(pdfData); List<PdfPageContentPositions> result = new ArrayList<>(); int pages = reader.getNumberOfPages(); for (int pageNum = 1; pageNum <= pages; pageNum++) { float width = reader.getPageSize(pageNum).getWidth(); float height = reader.getPageSize(pageNum).getHeight(); PdfRenderListener pdfRenderListener = new PdfRenderListener(pageNum, width, height); //解析pdf,定位位置 PdfContentStreamProcessor processor = new PdfContentStreamProcessor(pdfRenderListener); PdfDictionary pageDic = reader.getPageN(pageNum); PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES); try { processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic); } catch (IOException e) { reader.close(); throw e; } String content = pdfRenderListener.getContent(); List<CharPosition> charPositions = pdfRenderListener.getcharPositions(); List<float[]> positionsList = new ArrayList<>(); for (CharPosition charPosition : charPositions) { float[] positions = new float[]{charPosition.getPageNum(), charPosition.getX(), charPosition.getY()}; positionsList.add(positions); } PdfPageContentPositions pdfPageContentPositions = new PdfPageContentPositions(); pdfPageContentPositions.setContent(content); pdfPageContentPositions.setPostions(positionsList); result.add(pdfPageContentPositions); } reader.close(); return result; } private static List<float[]> findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) { List<float[]> result = new ArrayList<>(); String content = pdfPageContentPositions.getContent(); List<float[]> charPositions = pdfPageContentPositions.getPositions(); for (int pos = 0; pos < content.length(); ) { int positionIndex = content.indexOf(keyword, pos); if (positionIndex == -1) { break; } float[] postions = charPositions.get(positionIndex); result.add(postions); pos = positionIndex + 1; } return result; } private static class PdfPageContentPositions { private String content; private List<float[]> positions; public String getContent() { return content; } public void setContent(String content) { this.content = content; } public List<float[]> getPositions() { return positions; } public void setPostions(List<float[]> positions) { this.positions = positions; } } private static class PdfRenderListener implements RenderListener { private int pageNum; private float pageWidth; private float pageHeight; private StringBuilder contentBuilder = new StringBuilder(); private List<CharPosition> charPositions = new ArrayList<>(); public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) { this.pageNum = pageNum; this.pageWidth = pageWidth; this.pageHeight = pageHeight; } public void beginTextBlock() { } public void renderText(TextRenderInfo renderInfo) { List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos(); for (TextRenderInfo textRenderInfo : characterRenderInfos) { String word = textRenderInfo.getText(); if (word.length() > 1) { word = word.substring(word.length() - 1, word.length()); } Float rectangle = textRenderInfo.getAscentLine().getBoundingRectange(); float x = (float)rectangle.getX(); float y = (float)rectangle.getY(); // float x = (float)rectangle.getCenterX(); // float y = (float)rectangle.getCenterY(); // double x = rectangle.getMinX(); // double y = rectangle.getMaxY(); //這兩個是關鍵字在所在頁面的XY軸的百分比 float xPercent = Math.round(x / pageWidth * 10000) / 10000f; float yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f; // CharPosition charPosition = new CharPosition(pageNum, xPercent, yPercent); CharPosition charPosition = new CharPosition(pageNum, (float)x, (float)y); charPositions.add(charPosition); contentBuilder.append(word); } } public void endTextBlock() { } public void renderImage(ImageRenderInfo renderInfo) { } public String getContent() { return contentBuilder.toString(); } public List<CharPosition> getcharPositions() { return charPositions; } } private static class CharPosition { private int pageNum = 0; private float x = 0; private float y = 0; public CharPosition(int pageNum, float x, float y) { this.pageNum = pageNum; this.x = x; this.y = y; } public int getPageNum() { return pageNum; } public float getX() { return x; } public float getY() { return y; } @Override public String toString() { return "[pageNum=" + this.pageNum + ",x=" + this.x + ",y=" + this.y + "]"; } } }
以上是“java如何實現查找PDF關鍵字所在頁碼及其坐標”這篇文章的所有內容,感謝各位的閱讀!希望分享的內容對大家有幫助,更多相關知識,歡迎關注億速云行業資訊頻道!
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。