You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
76 lines
3.0 KiB
76 lines
3.0 KiB
import logging
|
|
from pdf2image import convert_from_path
|
|
import pytesseract
|
|
from PIL import Image
|
|
import tempfile
|
|
import os
|
|
from docx import Document
|
|
from datetime import datetime
|
|
|
|
# Route INFO-and-above records from the root logger to a log file.
# filemode='w' truncates the file each time this module is imported, so the
# log only ever holds the records of the current run.
logging.basicConfig(
    filename='/app/logs/pdf_to_word.log',
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
|
|
|
|
def pdf_to_word_txt(pdf_path, output_dir, lang='fas+eng', dpi=600):
    """Run OCR over every page of a PDF and save the text as a UTF-8 ``.txt`` file.

    The PDF is rasterised with ``convert_from_path`` and each page image is
    handed to Tesseract through ``pytesseract.image_to_string`` using the
    options ``--oem 3 --psm 3``. The per-page texts are joined with newlines
    and written to ``<output_dir>/<pdf name>_<YYYYmmdd_HHMMSS>.txt``.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the text file; it is created
            when it does not exist yet.
        lang: Tesseract language string (languages joined with ``+``).
        dpi: Resolution used to rasterise the PDF pages. All pages are held
            in memory at once, so high values on long documents are costly.

    Returns:
        The path of the written text file, or ``None`` if any step failed.
        Failures are logged with their traceback instead of being raised.
    """
    # splitext removes only the final extension, so "report.v2.pdf" keeps the
    # stem "report.v2" instead of being cut at the first dot.
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = os.path.join(output_dir, f'{pdf_name}_{current_time}.txt')

    try:
        # An empty output_dir means "current directory"; makedirs('') would fail.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        pages = convert_from_path(pdf_path, dpi)
        logging.info('Converted PDF to images, number of pages: %d', len(pages))

        all_text = []
        for i, page in enumerate(pages):
            # pytesseract accepts the PIL image directly. This avoids a lossy
            # JPEG round-trip through a temporary file, which also could not
            # be reopened by name on Windows and left an image handle open.
            text = pytesseract.image_to_string(page, lang=lang, config='--oem 3 --psm 3')
            logging.info('\n------Page %d------\n%s\n------*------\n', i + 1, text)
            all_text.append(text)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("\n".join(all_text))

        logging.info('Successfully converted %s to text at %s', pdf_path, output_path)
        return output_path
    except Exception as e:
        # Deliberate best-effort boundary: callers get None, details go to the log.
        logging.error('Error converting PDF to text: %s', e, exc_info=True)
        return None
|
|
|
|
|
|
|
|
|
|
def pdf_to_word_docx(pdf_path, output_dir, lang='eng+fas', dpi=300):
    """Run OCR over every page of a PDF and save the text as a ``.docx`` file.

    The PDF is rasterised with ``convert_from_path`` and each page image is
    handed to Tesseract through ``pytesseract.image_to_string`` using the
    options ``--oem 3 --psm 4``. Every page becomes one paragraph of a new
    document saved as ``<output_dir>/<pdf name>_<YYYYmmdd_HHMMSS>.docx``.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the document; it is created
            when it does not exist yet.
        lang: Tesseract language string (languages joined with ``+``).
        dpi: Resolution used to rasterise the PDF pages. All pages are held
            in memory at once, so high values on long documents are costly.

    Returns:
        The path of the written ``.docx`` file, or ``None`` if any step
        failed. Failures are logged with their traceback instead of raised.
    """
    # splitext removes only the final extension, so "report.v2.pdf" keeps the
    # stem "report.v2" instead of being cut at the first dot.
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = os.path.join(output_dir, f'{pdf_name}_{current_time}.docx')

    # Translation table that deletes characters XML 1.0 cannot store: the C0
    # control codes other than tab/LF/CR, plus U+FFFE and U+FFFF. Tesseract
    # output commonly ends with a form feed (\x0c), which python-docx rejects
    # with ValueError when the paragraph is added.
    xml_illegal = dict.fromkeys(
        [code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)]
        + [0xFFFE, 0xFFFF]
    )

    try:
        # An empty output_dir means "current directory"; makedirs('') would fail.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        pages = convert_from_path(pdf_path, dpi)
        logging.info('Converted PDF to images, number of pages: %d', len(pages))

        # Page segmentation mode 4 was chosen by the original author as a
        # better fit for pages with mixed content.
        custom_config = r'--oem 3 --psm 4'

        all_text = []
        for i, page in enumerate(pages):
            # pytesseract accepts the PIL image directly. This avoids a lossy
            # JPEG round-trip through a temporary file, which also could not
            # be reopened by name on Windows and left an image handle open.
            text = pytesseract.image_to_string(page, lang=lang, config=custom_config)
            logging.info('\n------Page %d------\n%s\n------*------\n', i + 1, text)
            all_text.append(text)

        doc = Document()
        for text in all_text:
            doc.add_paragraph(text.translate(xml_illegal))
        doc.save(output_path)

        logging.info('Successfully converted %s to a .docx file at %s', pdf_path, output_path)
        return output_path
    except Exception as e:
        # Deliberate best-effort boundary: callers get None, details go to the log.
        logging.error('Error converting PDF to .docx: %s', e, exc_info=True)
        return None
|