python-pdf2txt/pdf_to_word.py


								import logging

								from pdf2image import convert_from_path

								import pytesseract

								from PIL import Image

								import tempfile

								import os

								from docx import Document

								from datetime import datetime


								# Root-logger setup: INFO-and-above records go to a fixed log file.
								# filemode='w' opens that file for writing from scratch rather than appending.
								logging.basicConfig(
								    filename='/app/logs/pdf_to_word.log',
								    filemode='w',
								    level=logging.INFO,
								    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
								)


								def pdf_to_word_txt(pdf_path, output_dir, lang='fas+eng', dpi=600):
								    """OCR every page of a PDF and save the recognised text as a UTF-8 .txt file.

								    Args:
								        pdf_path: Path of the PDF to read.
								        output_dir: Directory that receives the result; created if missing.
								        lang: Tesseract language spec forwarded to pytesseract.
								        dpi: Resolution used when rasterising the PDF pages.

								    Returns:
								        Path of the written file, named ``<pdf stem>_<YYYYmmdd_HHMMSS>.txt``,
								        or None when any step fails (the error is logged with its traceback).
								    """
								    # splitext removes only the final extension, so "report.v2.pdf" keeps the
								    # stem "report.v2" instead of being cut at the first dot.
								    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
								    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
								    output_path = os.path.join(output_dir, f'{pdf_name}_{current_time}.txt')

								    try:
								        pages = convert_from_path(pdf_path, dpi=dpi)
								        logging.info(f'Converted PDF to images, number of pages: {len(pages)}')

								        all_text = []
								        for page_number, page in enumerate(pages, start=1):
								            # The rendered page is already a PIL image, which pytesseract
								            # accepts as-is. Skipping the JPEG temp file avoids a lossy
								            # re-encode, an image handle that was never closed, and reopening
								            # a still-open NamedTemporaryFile by name.
								            text = pytesseract.image_to_string(page, lang=lang, config='--oem 3 --psm 3')
								            logging.info("\n------Page %d------\n%s\n------*------\n", page_number, text)
								            all_text.append(text)

								        # An empty output_dir means "current directory"; makedirs('') would raise.
								        if output_dir:
								            os.makedirs(output_dir, exist_ok=True)
								        with open(output_path, 'w', encoding='utf-8') as f:
								            f.write("\n".join(all_text))

								        logging.info(f'Successfully converted {pdf_path} to text at {output_path}')
								        return output_path
								    except Exception as e:
								        # Boundary handler: callers get None instead of an exception.
								        logging.error(f'Error converting PDF to text: {e}', exc_info=True)
								        return None


								def pdf_to_word_docx(pdf_path, output_dir, lang='eng+fas', dpi=300):
								    """OCR every page of a PDF and save the text as a .docx, one paragraph per page.

								    Args:
								        pdf_path: Path of the PDF to read.
								        output_dir: Directory that receives the result; created if missing.
								        lang: Tesseract language spec forwarded to pytesseract.
								        dpi: Resolution used when rasterising the PDF pages.

								    Returns:
								        Path of the written file, named ``<pdf stem>_<YYYYmmdd_HHMMSS>.docx``,
								        or None when any step fails (the error is logged with its traceback).
								    """
								    # splitext removes only the final extension, so "report.v2.pdf" keeps the
								    # stem "report.v2" instead of being cut at the first dot.
								    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
								    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
								    output_path = os.path.join(output_dir, f'{pdf_name}_{current_time}.docx')

								    try:
								        pages = convert_from_path(pdf_path, dpi=dpi)
								        logging.info(f'Converted PDF to images, number of pages: {len(pages)}')

								        # --psm 4 selects Tesseract's single-column, variable-text-size layout
								        # mode, chosen here for pages with mixed content.
								        custom_config = r'--oem 3 --psm 4'

								        all_text = []
								        for page_number, page in enumerate(pages, start=1):
								            # The rendered page is already a PIL image, which pytesseract
								            # accepts as-is. Skipping the JPEG temp file avoids a lossy
								            # re-encode, an image handle that was never closed, and reopening
								            # a still-open NamedTemporaryFile by name.
								            text = pytesseract.image_to_string(page, lang=lang, config=custom_config)
								            logging.info("\n------Page %d------\n%s\n------*------\n", page_number, text)
								            all_text.append(text)

								        doc = Document()
								        for text in all_text:
								            # OCR output may contain control characters (e.g. a form-feed page
								            # separator) that XML 1.0, and so the document body, cannot store.
								            # Keep tab, LF and CR; drop everything else below U+0020.
								            printable = ''.join(ch for ch in text if ch >= ' ' or ch in '\t\n\r')
								            doc.add_paragraph(printable)

								        # An empty output_dir means "current directory"; makedirs('') would raise.
								        if output_dir:
								            os.makedirs(output_dir, exist_ok=True)
								        doc.save(output_path)

								        logging.info(f'Successfully converted {pdf_path} to a .docx file at {output_path}')
								        return output_path
								    except Exception as e:
								        # Boundary handler: callers get None instead of an exception.
								        logging.error(f'Error converting PDF to text: {e}', exc_info=True)
								        return None