You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
import logging
import os
import tempfile

import pytesseract
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from pdf2image import convert_from_path
from PIL import Image
# Route every record of level DEBUG and above to a log file. The file is
# opened in 'w' mode, so the previous run's log is overwritten each time.
# NOTE: basicConfig does not create missing directories; /app/logs must
# already exist when this module is imported.
logging.basicConfig(
    filename='/app/logs/pdf_to_word.log',
    filemode='w',
    level=logging.DEBUG,
    format='%(name)s - %(levelname)s - %(message)s',
)
def _ocr_page(page, lang):
    """Run Tesseract OCR on one rendered page image and return its text.

    The page is written to a temporary JPEG file that is removed again even
    when saving the image or running OCR raises.
    """
    # Reserve a unique file name, then close the handle immediately so the
    # image can be written to (and re-opened from) that path by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_image:
        temp_path = temp_image.name
    try:
        page.save(temp_path, 'JPEG')
        # Open through a context manager so the image file handle is closed
        # before the temporary file is deleted.
        with Image.open(temp_path) as image:
            return pytesseract.image_to_string(image, lang=lang, config='--psm 6')
    finally:
        os.unlink(temp_path)


def pdf_to_word(pdf_path, output_dir, lang='eng+fas'):
    """Convert a PDF to a Word document by running OCR on every page.

    Each page is rendered to an image (300 passed as the DPI argument of
    ``convert_from_path``), recognised with Tesseract, and appended to the
    document as a right-aligned "Page N" heading followed by a right-aligned
    paragraph containing the extracted text.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the ``.docx`` file. It is created
            if it does not exist yet.
        lang: Tesseract language specification passed to
            ``pytesseract.image_to_string``.

    Returns:
        The path of the saved ``.docx`` file, or ``None`` if any step failed
        (the error is logged with its traceback instead of being raised).
    """
    # splitext strips only the final extension, so "report.v2.pdf" becomes
    # "report.v2.docx" rather than colliding with "report.v1.pdf".
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_path = os.path.join(output_dir, f'{pdf_name}.docx')
    logging.info('Starting conversion for %s', pdf_path)

    try:
        # Make sure the destination exists before the expensive rendering
        # and OCR work, so an unusable output location fails fast.
        os.makedirs(output_dir, exist_ok=True)

        pages = convert_from_path(pdf_path, 300)
        logging.info('Converted PDF to images, number of pages: %d', len(pages))

        document = Document()
        document.add_heading('Document Title', 0)

        for page_number, page in enumerate(pages, start=1):
            text = _ocr_page(page, lang)
            logging.debug('Extracted text from page %d', page_number)

            # Right alignment presumably targets right-to-left text
            # (the default language set includes 'fas').
            heading = document.add_heading(f'Page {page_number}', level=1)
            heading.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            paragraph = document.add_paragraph(text)
            paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            logging.info(
                '\n------Page %d------\n%s\n------*------\n', page_number, text
            )

        document.save(output_path)
        logging.info('Document saved to %s', output_path)
        return output_path
    except Exception as e:
        # Top-level boundary: callers rely on None (not an exception) to
        # signal failure, so log the traceback and swallow the error.
        logging.exception('An error occurred: %s', e)
        return None
|