You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

76 lines
3.0 KiB

8 months ago
8 months ago
8 months ago
8 months ago
8 months ago
8 months ago
8 months ago
8 months ago
8 months ago
8 months ago
8 months ago
8 months ago
  1. import logging
  2. from pdf2image import convert_from_path
  3. import pytesseract
  4. from PIL import Image
  5. import tempfile
  6. import os
  7. from docx import Document
  8. from datetime import datetime
  9. # Configure logging
  10. logging.basicConfig(level=logging.INFO, filename='/app/logs/pdf_to_word.log', filemode='w',
  11. format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  12. def pdf_to_word_txt(pdf_path, output_dir, lang='fas+eng'):
  13. pdf_name = os.path.basename(pdf_path).split('.')[0]
  14. current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
  15. output_path = os.path.join(output_dir, f'{pdf_name}_{current_time}.txt')
  16. try:
  17. pages = convert_from_path(pdf_path, 600)
  18. logging.info(f'Converted PDF to images, number of pages: {len(pages)}')
  19. all_text = []
  20. for i, page in enumerate(pages):
  21. with tempfile.NamedTemporaryFile(suffix='.jpg') as temp_image:
  22. page.save(temp_image.name, 'JPEG')
  23. text = pytesseract.image_to_string(Image.open(temp_image.name), lang=lang, config='--oem 3 --psm 3')
  24. logging.info("\n------"+f'Page {i+1}'+"------\n"+text+"\n------*------\n")
  25. all_text.append(text)
  26. with open(output_path, 'w', encoding='utf-8') as f:
  27. f.write("\n".join(all_text))
  28. logging.info(f'Successfully converted {pdf_path} to text at {output_path}')
  29. return output_path
  30. except Exception as e:
  31. logging.error(f'Error converting PDF to text: {e}', exc_info=True)
  32. return None
  33. def pdf_to_word_docx(pdf_path, output_dir, lang='eng+fas'):
  34. pdf_name = os.path.basename(pdf_path).split('.')[0]
  35. # output_path = os.path.join(output_dir, f'{pdf_name}.docx')
  36. current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
  37. output_path = os.path.join(output_dir, f'{pdf_name}_{current_time}.docx')
  38. try:
  39. # Increase DPI for higher image quality
  40. pages = convert_from_path(pdf_path, 300)
  41. logging.info(f'Converted PDF to images, number of pages: {len(pages)}')
  42. # PSM 4 might be more suitable for mixed content like your provided image
  43. custom_config = r'--oem 3 --psm 4'
  44. # custom_config = r''
  45. all_text = []
  46. for i, page in enumerate(pages):
  47. with tempfile.NamedTemporaryFile(suffix='.jpg') as temp_image:
  48. page.save(temp_image.name, 'JPEG')
  49. text = pytesseract.image_to_string(Image.open(temp_image.name), lang=lang, config=custom_config)
  50. logging.info("\n------"+f'Page {i+1}'+"------\n"+text+"\n------*------\n")
  51. all_text.append(text)
  52. # Save to a .docx file using a library like python-docx
  53. doc = Document()
  54. for text in all_text:
  55. doc.add_paragraph(text)
  56. doc.save(output_path)
  57. logging.info(f'Successfully converted {pdf_path} to a .docx file at {output_path}')
  58. return output_path
  59. except Exception as e:
  60. logging.error(f'Error converting PDF to text: {e}', exc_info=True)
  61. return None