"""Flask service that accepts PDF uploads and converts them in the background.

Endpoints:
    POST /upload-pdf  -- store an uploaded PDF, start conversion in a
                         background thread and answer 202 immediately. When a
                         ``callback_url`` form field is supplied, the converted
                         file is POSTed to that URL once it is ready.
    POST /callback    -- store a file POSTed to this service (e.g. the result
                         of a conversion) in the callback folder.
"""
from datetime import datetime
import logging
import os
import re
from threading import Thread
from urllib.parse import urlparse

from flask import Flask, request, send_from_directory, jsonify, abort
import requests

from pdf_to_word import pdf_to_word_txt, pdf_to_word_docx

LOG_FILE = '/app/logs/app.log'
UPLOAD_FOLDER = '/app/uploads'
OUTPUT_FOLDER = '/app/outputs'
CALLBACK_FOLDER = '/app/callbacks'

# Upper bound (seconds) for delivering a converted file to the callback URL.
# Without a timeout, requests waits indefinitely and the worker thread leaks.
CALLBACK_TIMEOUT_SECONDS = 30

# Anything that is not a word character, dot or dash is replaced in filenames.
_UNSAFE_FILENAME_CHARS = re.compile(r'[^\w.\-]')

# The log directory must exist before basicConfig opens the log file.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

# Set up logging
logging.basicConfig(level=logging.INFO,
                    filename=LOG_FILE,
                    filemode='w',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Configure specific logger for potentially noisy libraries or modules
watchdog_logger = logging.getLogger('watchdog')
watchdog_logger.setLevel(logging.ERROR)  # Show only errors from watchdog

app = Flask(__name__)

for _folder in (UPLOAD_FOLDER, OUTPUT_FOLDER, CALLBACK_FOLDER):
    os.makedirs(_folder, exist_ok=True)


def _safe_filename(filename):
    """Return *filename* reduced to a single, harmless path component.

    Directory parts (both ``/`` and ``\\`` separated) are dropped, characters
    outside ``[\\w.-]`` become ``_`` and leading dots are stripped, so the
    result can never be absolute, contain ``..`` or name a hidden file.
    Returns an empty string when nothing usable remains.
    """
    base = os.path.basename((filename or '').replace('\\', '/'))
    return _UNSAFE_FILENAME_CHARS.sub('_', base).lstrip('.')


def _is_http_url(url):
    """Return True when *url* is an absolute http(s) URL with a host part."""
    parsed = urlparse(url)
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)


def process_pdf_in_background(pdf_path, output_folder, callback_url=None):
    """Convert the PDF at *pdf_path* and optionally deliver the result.

    Args:
        pdf_path: Path of the uploaded PDF file.
        output_folder: Folder handed to ``pdf_to_word_docx`` for its output.
        callback_url: Optional URL; when given, the converted file is sent
            there as multipart field ``file``.

    Every exception is logged rather than raised, because this function runs
    in a worker thread where nobody could catch it.
    """
    try:
        # pdf_to_word_docx presumably returns the path of the generated file
        # and a falsy value on failure -- confirm against pdf_to_word.
        output_path = pdf_to_word_docx(pdf_path, output_folder)
        if not output_path:
            logging.error(f'Failed to convert PDF {pdf_path}')
            return
        if callback_url:
            with open(output_path, 'rb') as f:
                files = {'file': (os.path.basename(output_path), f)}
                response = requests.post(callback_url, files=files,
                                         timeout=CALLBACK_TIMEOUT_SECONDS)
                response.raise_for_status()
    except Exception as e:
        logging.error(f'Background processing error: {e}', exc_info=True)


def start_background_task(target, *args):
    """Run ``target(*args)`` in a newly started thread."""
    thread = Thread(target=target, args=args)
    thread.start()


@app.route('/upload-pdf', methods=['POST'])
def upload_pdf():
    """Accept a PDF upload and start its conversion in the background.

    Expects multipart field ``file`` (a ``.pdf`` file, extension matched
    case-insensitively) and an optional form field ``callback_url``.

    Returns:
        202 with a JSON message once the file is stored and processing began.

    Aborts with 400 when no PDF was uploaded or ``callback_url`` is not an
    http(s) URL.
    """
    # .get() instead of [] so that a missing field reaches the logging/abort
    # branch below rather than raising a key error before it.
    file = request.files.get('file')
    callback_url = request.form.get('callback_url')

    # The client controls the filename: sanitise it before building a path.
    safe_name = _safe_filename(file.filename) if file else ''
    if not safe_name.lower().endswith('.pdf'):
        logging.warning('Invalid file upload attempt')
        abort(400, 'Invalid file type or no file uploaded')

    # NOTE(security): the server will POST to this client-chosen URL (SSRF
    # risk). Only the scheme is checked here; restrict the allowed hosts if
    # this service can reach internal networks.
    if callback_url and not _is_http_url(callback_url):
        logging.warning('Invalid callback URL in upload request')
        abort(400, 'Invalid callback_url')

    # Microseconds keep same-named uploads within one second from colliding.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    pdf_filename = os.path.splitext(safe_name)[0] + '_' + timestamp + '.pdf'
    pdf_path = os.path.join(UPLOAD_FOLDER, pdf_filename)
    file.save(pdf_path)
    logging.info(f'PDF uploaded and saved to {pdf_path}')

    # Start the PDF processing in a background thread
    start_background_task(process_pdf_in_background, pdf_path, OUTPUT_FOLDER, callback_url)

    # Respond immediately
    return jsonify({'message': 'File upload accepted, processing started'}), 202


# NOTE: previous synchronous implementation of /upload-pdf, kept for reference.
# It converted the file inside the request and either forwarded the result to
# the callback URL or returned it as a download.
#
# @app.route('/upload-pdf', methods=['POST'])
# def upload_pdf():
#     file = request.files['file']
#     callback_url = request.form.get('callback_url')
#     if file and file.filename.endswith('.pdf'):
#         pdf_filename = os.path.splitext(file.filename)[0] + '_' + datetime.now().strftime("%Y%m%d_%H%M%S") + '.pdf'
#         pdf_path = os.path.join(UPLOAD_FOLDER, pdf_filename)
#         file.save(pdf_path)
#         logging.info(f'PDF uploaded and saved to {pdf_path}')
#         # Convert PDF to text
#         #output_path = pdf_to_word_txt(pdf_path, OUTPUT_FOLDER)
#         output_path = pdf_to_word_docx(pdf_path, OUTPUT_FOLDER)
#         if output_path:
#             logging.info(f'Text file created at {output_path}')
#             if callback_url:
#                 try:
#                     with open(output_path, 'rb') as f:
#                         files = {'file': (os.path.basename(output_path), f)}
#                         response = requests.post(callback_url, files=files)
#                         response.raise_for_status()  # Will raise an HTTPError for bad requests
#                     return jsonify({'message': 'File processed and sent successfully'}), 200
#                 except requests.exceptions.RequestException as e:
#                     logging.error(f'Failed to send file to callback URL: {e}')
#                     abort(500, 'Failed to send file to callback URL')
#             else:
#                 return send_from_directory(OUTPUT_FOLDER, os.path.basename(output_path), as_attachment=True)
#         else:
#             logging.error('Failed to convert PDF')
#             abort(500, 'Conversion failed')
#     else:
#         logging.warning('Invalid file upload attempt')
#         abort(400, 'Invalid file type or no file uploaded')


@app.route('/callback', methods=['POST'])
def callback():
    """Store a file POSTed as multipart field ``file`` in the callback folder.

    Returns:
        200 with a JSON message once the file is saved.

    Aborts with 400 when no file (or no usable filename) was received.
    """
    file = request.files.get('file')

    # Sanitise the client-supplied name so the file cannot be written outside
    # CALLBACK_FOLDER.
    safe_name = _safe_filename(file.filename) if file else ''
    if not safe_name:
        logging.error('No file received at callback')
        abort(400, 'No file received')

    filepath = os.path.join(CALLBACK_FOLDER, safe_name)
    file.save(filepath)
    logging.info(f'File received and saved at {filepath}')
    return jsonify({'message': 'File received and saved successfully'}), 200


if __name__ == '__main__':
    # The interactive debugger allows arbitrary code execution, so it must not
    # be enabled by default on a server bound to all interfaces. Opt in
    # explicitly with FLASK_DEBUG=1 during local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '0') == '1'
    app.run(debug=debug_enabled, host='0.0.0.0', port=5000)