|
|
import logging
import os
import tempfile
from datetime import datetime

import pytesseract
from docx import Document
from pdf2image import convert_from_path
from PIL import Image
# Configure logging: INFO and above go to the application log file.
# filemode='w' truncates the file, so each process start begins a fresh log.
_LOG_FILE = '/app/logs/pdf_to_word.log'
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
try:
    # basicConfig opens the file immediately; make sure its directory exists
    # so that merely importing this module cannot fail on a missing folder.
    os.makedirs(os.path.dirname(_LOG_FILE), exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        filename=_LOG_FILE,
        filemode='w',
        format=_LOG_FORMAT,
    )
except OSError:
    # Log file not writable (read-only filesystem, permissions, ...):
    # keep the same level/format but fall back to the default stderr handler.
    logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
def pdf_to_word_txt(pdf_path, output_dir, lang='fas+eng'):
    """OCR every page of a PDF and write the recognized text to a UTF-8 .txt file.

    Each page is rendered to an image at 600 DPI, saved as a temporary JPEG,
    and passed to Tesseract with ``--oem 3 --psm 3``. The per-page texts are
    joined with newlines and written to
    ``<output_dir>/<pdf stem>_<YYYYmmdd_HHMMSS>.txt``.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory the text file is written to. It is not created
            here; a missing directory is reported as a failure.
        lang: Tesseract language specification (``+``-separated codes).

    Returns:
        The path of the written text file, or ``None`` if any step failed
        (the error and its traceback are logged).
    """
    # splitext removes only the final extension, so "report.v2.pdf" keeps the
    # stem "report.v2" instead of being cut at the first dot.
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = os.path.join(output_dir, f'{pdf_name}_{current_time}.txt')

    try:
        # NOTE: all pages are rendered up front and held in memory.
        pages = convert_from_path(pdf_path, 600)
        logging.info('Converted PDF to images, number of pages: %s', len(pages))

        all_text = []
        for i, page in enumerate(pages):
            with tempfile.NamedTemporaryFile(suffix='.jpg') as temp_image:
                page.save(temp_image.name, 'JPEG')
                # Close the image handle before the temporary file is removed.
                with Image.open(temp_image.name) as image:
                    text = pytesseract.image_to_string(
                        image, lang=lang, config='--oem 3 --psm 3'
                    )
            logging.info("\n------Page %s------\n%s\n------*------\n", i + 1, text)
            all_text.append(text)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("\n".join(all_text))

        logging.info('Successfully converted %s to text at %s', pdf_path, output_path)
        return output_path
    except Exception as e:
        # Boundary handler: callers rely on a None return instead of an exception.
        logging.exception('Error converting PDF to text: %s', e)
        return None
def pdf_to_word_docx(pdf_path, output_dir, lang='eng+fas'):
    """OCR every page of a PDF and save the recognized text as a .docx file.

    Each page is rendered to an image at 300 DPI, saved as a temporary JPEG,
    and passed to Tesseract with ``--oem 3 --psm 4``. Every page's text
    becomes one paragraph of a new document saved to
    ``<output_dir>/<pdf stem>_<YYYYmmdd_HHMMSS>.docx``.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory the document is written to. It is not created
            here; a missing directory is reported as a failure.
        lang: Tesseract language specification (``+``-separated codes).

    Returns:
        The path of the written .docx file, or ``None`` if any step failed
        (the error and its traceback are logged).
    """
    # splitext removes only the final extension, so "report.v2.pdf" keeps the
    # stem "report.v2" instead of being cut at the first dot.
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = os.path.join(output_dir, f'{pdf_name}_{current_time}.docx')

    # Page segmentation mode 4 was chosen as a better fit for mixed content.
    custom_config = r'--oem 3 --psm 4'

    # Control characters other than tab/newline/carriage return cannot be
    # stored in the document XML. Tesseract output typically ends each page
    # with a form feed (\x0c), which would make add_paragraph raise ValueError.
    xml_illegal = dict.fromkeys(
        code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
    )

    try:
        # NOTE: all pages are rendered up front and held in memory.
        pages = convert_from_path(pdf_path, 300)
        logging.info('Converted PDF to images, number of pages: %s', len(pages))

        all_text = []
        for i, page in enumerate(pages):
            with tempfile.NamedTemporaryFile(suffix='.jpg') as temp_image:
                page.save(temp_image.name, 'JPEG')
                # Close the image handle before the temporary file is removed.
                with Image.open(temp_image.name) as image:
                    text = pytesseract.image_to_string(
                        image, lang=lang, config=custom_config
                    )
            logging.info("\n------Page %s------\n%s\n------*------\n", i + 1, text)
            all_text.append(text)

        # One paragraph per page, written with python-docx.
        doc = Document()
        for text in all_text:
            doc.add_paragraph(text.translate(xml_illegal))
        doc.save(output_path)

        logging.info(
            'Successfully converted %s to a .docx file at %s', pdf_path, output_path
        )
        return output_path
    except Exception as e:
        # Boundary handler: callers rely on a None return instead of an exception.
        logging.exception('Error converting PDF to text: %s', e)
        return None
|