You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

46 lines
1.8 KiB

  1. import logging
  2. from pdf2image import convert_from_path
  3. import pytesseract
  4. from PIL import Image
  5. import tempfile
  6. from docx import Document
  7. from docx.enum.text import WD_ALIGN_PARAGRAPH
  8. import os
  9. # Configure logging
  10. logging.basicConfig(level=logging.DEBUG, filename='/app/logs/pdf_to_word.log', filemode='w',
  11. format='%(name)s - %(levelname)s - %(message)s')
  12. def pdf_to_word(pdf_path, output_dir, lang='eng+fas'):
  13. pdf_name = os.path.basename(pdf_path).split('.')[0]
  14. output_path = os.path.join(output_dir, f'{pdf_name}.docx')
  15. logging.info(f'Starting conversion for {pdf_path}')
  16. try:
  17. pages = convert_from_path(pdf_path, 300)
  18. logging.info(f'Converted PDF to images, number of pages: {len(pages)}')
  19. document = Document()
  20. document.add_heading('Document Title', 0)
  21. for i, page in enumerate(pages):
  22. with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_image:
  23. page.save(temp_image.name, 'JPEG')
  24. text = pytesseract.image_to_string(Image.open(temp_image.name), lang=lang, config='--psm 6')
  25. logging.debug(f'Extracted text from page {i+1}')
  26. heading = document.add_heading(f'Page {i+1}', level=1)
  27. heading.alignment = WD_ALIGN_PARAGRAPH.RIGHT
  28. paragraph = document.add_paragraph(text)
  29. paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
  30. logging.info("\n------"+f'Page {i+1}'+"------\n"+text+"\n------*------\n")
  31. os.unlink(temp_image.name)
  32. document.save(output_path)
  33. logging.info(f'Document saved to {output_path}')
  34. return output_path
  35. except Exception as e:
  36. logging.error(f'An error occurred: {e}', exc_info=True)
  37. return None