# To remove the additional created files
import os

# To read the PDF
import PyPDF2
# To analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
# To extract text from tables in PDF
import pdfplumber
# To extract the images from the PDFs
from PIL import Image
from pdf2image import convert_from_path
# To perform OCR to extract text from images
import pytesseract
# Skeleton of the per-page analysis loop: each branch below is a placeholder
# marking where the matching extraction helper is meant to be called.
# NOTE(review): ``pdf_path`` is not defined in this snippet; it must be set
# to the PDF's path before this loop runs.
for pagenum, page in enumerate(extract_pages(pdf_path)):

    # Iterate the elements that composed a page
    for element in page:

        # Check if the element is a text element
        if isinstance(element, LTTextContainer):
            # Function to extract text from the text block
            pass
            # Function to extract text format
            pass

        # Check the elements for images
        if isinstance(element, LTFigure):
            # Function to convert PDF to Image
            pass
            # Function to extract text with OCR
            pass

        # Check the elements for tables
        if isinstance(element, LTRect):
            # Function to extract table
            pass
            # Function to convert table content into a string
            pass
因此，现在我们已经理解了流程中的分析部分，接下来让我们创建从各个组件中提取文本所需的函数。
定义从PDF中提取文本的函数
从这里开始，从文本容器中提取文本就非常简单了。
# Create a function to extract text
def text_extraction(element):
    """Return the text of a text element together with the formats used in it.

    Args:
        element: An object exposing ``get_text()`` whose iteration yields its
            text lines (presumably a pdfminer ``LTTextContainer`` -- confirm
            against the caller).

    Returns:
        A tuple ``(line_text, format_per_line)``. ``line_text`` is the string
        returned by ``element.get_text()``. ``format_per_line`` is a list of
        the distinct font names and font sizes found on the element's
        characters, in first-seen order (names and sizes share one list).
    """
    # Extracting the text from the in-line text element
    line_text = element.get_text()

    # Collect the font name and size of every character in every text line.
    line_formats = []
    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            # Iterating through each character in the line of text
            for character in text_line:
                if isinstance(character, LTChar):
                    line_formats.append(character.fontname)
                    line_formats.append(character.size)

    # De-duplicate while keeping first-seen order; a plain set() would yield
    # a different ordering from run to run because of string hash
    # randomisation.
    format_per_line = list(dict.fromkeys(line_formats))

    # Return a tuple with the text in each line along with its format
    return (line_text, format_per_line)
# Create a function to crop the image elements from PDFs
def crop_image(element, pageObj, output_path='cropped_image.pdf'):
    """Crop ``pageObj`` to the bounding box of ``element`` and save it as a PDF.

    Args:
        element: Layout element exposing ``x0``, ``y0``, ``x1`` and ``y1``
            (presumably a pdfminer ``LTFigure`` -- confirm against the caller).
        pageObj: PyPDF2 page object to crop. Its ``mediabox`` is modified
            in place.
        output_path: Destination of the single-page cropped PDF. Defaults to
            ``'cropped_image.pdf'``, the file name the original code used.
    """
    # pdfminer measures y from the bottom of the page, so y0 is the lower edge
    # and y1 the upper edge of the element's bounding box.
    image_left, image_bottom, image_right, image_top = (
        element.x0,
        element.y0,
        element.x1,
        element.y1,
    )

    # Crop the page using a normalised rectangle: lower-left / upper-right.
    pageObj.mediabox.lower_left = (image_left, image_bottom)
    pageObj.mediabox.upper_right = (image_right, image_top)

    # Save the cropped page to a new PDF
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)

    # Save the cropped PDF to a new file
    with open(output_path, 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)
# Create a function to convert the PDF to images
def convert_to_images(input_file, output_file="PDF_image.png"):
    """Render the first page of a PDF and save it as a PNG image.

    Args:
        input_file: Path of the PDF to render.
        output_file: Path of the PNG to write. Defaults to
            ``"PDF_image.png"``, the file name the original code used.

    Raises:
        IndexError: If no page could be rendered from ``input_file``.
    """
    # Only the first page is saved, so only that page is rendered.
    images = convert_from_path(input_file, first_page=1, last_page=1)
    if not images:
        raise IndexError(f"no pages could be rendered from {input_file!r}")

    image = images[0]
    image.save(output_file, "PNG")
# Create a function to read text from images
def image_to_text(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the image.
    """
    # Open the image in a context manager so the underlying file is closed
    # as soon as OCR has finished.
    with Image.open(image_path) as img:
        # Extract the text from the image
        text = pytesseract.image_to_string(img)
    return text