java如何實現查找PDF關鍵字所在頁碼及其坐標

發布時間：2021-04-15 10:35:42 ｜ 來源：億速云 ｜ 閱讀：334 ｜ 作者：小新 ｜ 欄目：編程語言

這篇文章主要介紹java如何實現查找PDF關鍵字所在頁碼及其坐標，文中介紹的非常詳細，具有一定的參考價值，感興趣的小伙伴們一定要看完！

1、因為最近有這方面的需求，用過之後記錄一下。

2、此功能與PDF閱讀器中的Ctrl+F搜尋性質相同；如果PDF的內容是圖片形式（例如掃描件），其中的文字無法被解析，因此不支持定位到關鍵字。

import com.itextpdf.awt.geom.Rectangle2D.Float;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * 消失的太陽
 */
public class MyTest {
 public static void main(String[] args) throws IOException {
  //1.給定文件
  File pdfFile = new File("D://test.pdf");
  //2.定義一個byte數組，長度為文件的長度
  byte[] pdfData = new byte[(int) pdfFile.length()];
  //3.IO流讀取文件內容到byte數組
  FileInputStream inputStream = null;
  try {
   inputStream = new FileInputStream(pdfFile);
   inputStream.read(pdfData);
  } catch (IOException e) {
   throw e;
  } finally {
   if (inputStream != null) {
    try {
     inputStream.close();
    } catch (IOException e) {
    }
   }
  }
  //4.指定關鍵字
  String keyword = "消失的太陽：";
  //5.調用方法，給定關鍵字和文件
  List<float[]> positions = findKeywordPostions(pdfData, keyword);
  //6.返回值類型是 List<float[]> 每個list元素代表一個匹配的位置，分別為 float[0]所在頁碼 float[1]所在x軸 float[2]所在y軸
  System.out.println("total:" + positions.size());
  if (positions != null && positions.size() > 0) {
   for (float[] position : positions) {
    System.out.print("pageNum: " + (int) position[0]);
    System.out.print("\tx: " + position[1]);
    System.out.println("\ty: " + position[2]);
   }
  }
 }
 /**
  * findKeywordPostions
  * @param pdfData  通過IO流 PDF文件轉化的byte數組
  * @param keyword  關鍵字
  * @return List<float [ ]> : float[0]:pageNum float[1]:x float[2]:y
  * @throws IOException
  */
 public static List<float[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException {
  List<float[]> result = new ArrayList<>();
  List<PdfPageContentPositions> pdfPageContentPositions = getPdfContentPostionsList(pdfData);
  for (PdfPageContentPositions pdfPageContentPosition : pdfPageContentPositions) {
   List<float[]> charPositions = findPositions(keyword, pdfPageContentPosition);
   if (charPositions == null || charPositions.size() < 1) {
    continue;
   }
   result.addAll(charPositions);
  }
  return result;
 }
 private static List<PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException {
  PdfReader reader = new PdfReader(pdfData);
  List<PdfPageContentPositions> result = new ArrayList<>();
  int pages = reader.getNumberOfPages();
  for (int pageNum = 1; pageNum <= pages; pageNum++) {
   float width = reader.getPageSize(pageNum).getWidth();
   float height = reader.getPageSize(pageNum).getHeight();
   PdfRenderListener pdfRenderListener = new PdfRenderListener(pageNum, width, height);
   //解析pdf，定位位置
   PdfContentStreamProcessor processor = new PdfContentStreamProcessor(pdfRenderListener);
   PdfDictionary pageDic = reader.getPageN(pageNum);
   PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
   try {
    processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);
   } catch (IOException e) {
    reader.close();
    throw e;
   }
   String content = pdfRenderListener.getContent();
   List<CharPosition> charPositions = pdfRenderListener.getcharPositions();
   List<float[]> positionsList = new ArrayList<>();
   for (CharPosition charPosition : charPositions) {
    float[] positions = new float[]{charPosition.getPageNum(), charPosition.getX(), charPosition.getY()};
    positionsList.add(positions);
   }
   PdfPageContentPositions pdfPageContentPositions = new PdfPageContentPositions();
   pdfPageContentPositions.setContent(content);
   pdfPageContentPositions.setPostions(positionsList);
   result.add(pdfPageContentPositions);
  }
  reader.close();
  return result;
 }
 private static List<float[]> findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) {
  List<float[]> result = new ArrayList<>();
  String content = pdfPageContentPositions.getContent();
  List<float[]> charPositions = pdfPageContentPositions.getPositions();
  for (int pos = 0; pos < content.length(); ) {
   int positionIndex = content.indexOf(keyword, pos);
   if (positionIndex == -1) {
    break;
   }
   float[] postions = charPositions.get(positionIndex);
   result.add(postions);
   pos = positionIndex + 1;
  }
  return result;
 }
 private static class PdfPageContentPositions {
  private String content;
  private List<float[]> positions;
  public String getContent() {
   return content;
  }
  public void setContent(String content) {
   this.content = content;
  }
  public List<float[]> getPositions() {
   return positions;
  }
  public void setPostions(List<float[]> positions) {
   this.positions = positions;
  }
 }
 private static class PdfRenderListener implements RenderListener {
  private int pageNum;
  private float pageWidth;
  private float pageHeight;
  private StringBuilder contentBuilder = new StringBuilder();
  private List<CharPosition> charPositions = new ArrayList<>();
  public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) {
   this.pageNum = pageNum;
   this.pageWidth = pageWidth;
   this.pageHeight = pageHeight;
  }
  public void beginTextBlock() {
  }
  public void renderText(TextRenderInfo renderInfo) {
   List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos();
   for (TextRenderInfo textRenderInfo : characterRenderInfos) {
    String word = textRenderInfo.getText();
    if (word.length() > 1) {
     word = word.substring(word.length() - 1, word.length());
    }
    Float rectangle = textRenderInfo.getAscentLine().getBoundingRectange();
    float x = (float)rectangle.getX();
    float y = (float)rectangle.getY();
//    float x = (float)rectangle.getCenterX();
//    float y = (float)rectangle.getCenterY();
//    double x = rectangle.getMinX();
//    double y = rectangle.getMaxY();
    //這兩個是關鍵字在所在頁面的XY軸的百分比
    float xPercent = Math.round(x / pageWidth * 10000) / 10000f;
    float yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f;
//    CharPosition charPosition = new CharPosition(pageNum, xPercent, yPercent);
    CharPosition charPosition = new CharPosition(pageNum, (float)x, (float)y);
    charPositions.add(charPosition);
    contentBuilder.append(word);
   }
  }
  public void endTextBlock() {
  }
  public void renderImage(ImageRenderInfo renderInfo) {
  }
  public String getContent() {
   return contentBuilder.toString();
  }
  public List<CharPosition> getcharPositions() {
   return charPositions;
  }
 }
 private static class CharPosition {
  private int pageNum = 0;
  private float x = 0;
  private float y = 0;
  public CharPosition(int pageNum, float x, float y) {
   this.pageNum = pageNum;
   this.x = x;
   this.y = y;
  }
  public int getPageNum() {
   return pageNum;
  }
  public float getX() {
   return x;
  }
  public float getY() {
   return y;
  }
  @Override
  public String toString() {
   return "[pageNum=" + this.pageNum + ",x=" + this.x + ",y=" + this.y + "]";
  }
 }
}

以上是“java如何實現查找PDF關鍵字所在頁碼及其坐標”這篇文章的所有內容，感謝各位的閱讀！希望分享的內容對大家有幫助，更多相關知識，歡迎關注億速云行業資訊頻道！

向AI問一下細節

91超碰碰碰碰久久久久久综合_超碰av人澡人澡人澡人澡人掠_国产黄大片在线观看画质优化_txt小说免费全本

java如何實現查找PDF關鍵字所在頁碼及其坐標

猜你喜歡

91超碰碰碰碰久久久久久综合_超碰av人澡人澡人澡人澡人掠_国产黄大片在线观看画质优化_txt小说免费全本

java如何實現查找PDF關鍵字所在頁碼及其坐標

猜你喜歡

最新資訊

相關推薦

相關標簽