from flask import Flask, Response, request, send_file, jsonify, send_from_directory
from flask_cors import CORS
import pdfplumber
import pandas as pd
import camelot
import fitz  # PyMuPDF
import xlrd
import os
import io
import re
import tempfile
import base64
from PIL import Image
from openpyxl import Workbook, load_workbook
from openpyxl.drawing.image import Image as OpenPyXLImage
from openpyxl.utils.datetime import from_excel  # used for the serial-date conversion below

app = Flask(__name__)
CORS(app)

# Function to remove illegal characters
def remove_illegal_characters(value):
    if isinstance(value, str):
        return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', value)
    return value
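# Example: remove_illegal_characters('Ship\x0bTo') returns 'ShipTo'; such
# control characters would otherwise raise openpyxl's IllegalCharacterError.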

# Function to check if the image is completely black
def is_image_black(image):
    grayscale_image = image.convert('L')
    histogram = grayscale_image.histogram()
    # histogram[0] counts pure-black pixels; the image is all black when that
    # equals the total pixel count
    return histogram[0] == sum(histogram)

def extract_text_in_rectangle(pdf_path):
    """
    Finds the first occurrence of 'Ship To' in the PDF and extracts the text
    inside a rectangle expanded to cover the address block below it.
    """
    # Open the PDF
    pdf_document = fitz.open(pdf_path)
    extracted_text = ""

    for page_number, page in enumerate(pdf_document):
        # Search for "Ship To"
        text_instances = page.search_for("Ship To")

        if text_instances:
            for rect in text_instances:
                # Expand the rectangle to include the address area below "Ship To"
                expanded_rect = rect + (0, 5, 220, 45)  # Adjust dimensions for address
                # Extract text inside the expanded rectangle
                extracted_text = page.get_text("text", clip=expanded_rect)
                break  # Stop after finding the first "Ship To"

            break  # Process only the first relevant page

    pdf_document.close()
    return extracted_text.strip()

def convert_xls_to_xlsx(xls_path, xlsx_path):
    """Convert a legacy .xls workbook to .xlsx, copying cell values only
    (formatting, formulas, and images are not preserved)."""
    xls_book = xlrd.open_workbook(xls_path)
    xlsx_book = Workbook()
    for i in range(xls_book.nsheets):
        xls_sheet = xls_book.sheet_by_index(i)
        if i == 0:
            sheet = xlsx_book.active
            sheet.title = xls_sheet.name
        else:
            sheet = xlsx_book.create_sheet(title=xls_sheet.name)

        for row in range(xls_sheet.nrows):
            sheet.append(xls_sheet.row_values(row))
    xlsx_book.save(xlsx_path)
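# Example usage (hypothetical paths):
#   convert_xls_to_xlsx('uploads/legacy.xls', 'uploads/legacy.xlsx')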

@app.route('/')
def home():
    return jsonify({'message': 'Welcome to the PDF to Excel converter API!'})


@app.route('/upload', methods=['POST'])
def upload_file():
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    if file:
        # Ensure the 'uploads' directory exists
        os.makedirs('uploads', exist_ok=True)

        # Save the uploaded PDF file
        pdf_path = os.path.join('uploads', file.filename)
        file.save(pdf_path)

        # Extract tables and save to Excel
        excel_path = os.path.splitext(pdf_path)[0] + '.xlsx'
        try:
            with pdfplumber.open(pdf_path) as pdf:
                all_tables = []
                for page_num, page in enumerate(pdf.pages, start=1):
                    tables = page.extract_tables()
                    for table in tables:
                        df = pd.DataFrame(table)
                        all_tables.append(df)

            if not all_tables:
                return jsonify({'error': 'No tables were found in the PDF'}), 404

            combined_df = pd.concat(all_tables, ignore_index=True)

            # Drop the 'Page' column if it exists
            if 'Page' in combined_df.columns:
                combined_df = combined_df.drop(columns=['Page'])

            combined_df.to_excel(excel_path, index=False)

            return send_file(excel_path, as_attachment=True)

        except Exception as e:
            return jsonify({'error': str(e)}), 500
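
# Example request to /upload (hypothetical host/port; see __main__ below):
#   curl -F "file=@document.pdf" http://localhost:2000/upload -o tables.xlsx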


@app.route('/upload_invoice', methods=['POST'])
def upload_file_camelot():
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    if file:
        # Ensure the 'uploads' directory exists
        os.makedirs('uploads', exist_ok=True)

        # Save the uploaded PDF file
        pdf_path = os.path.join('uploads', file.filename)
        file.save(pdf_path)

        # Extract tables and save to Excel
        excel_path = os.path.splitext(pdf_path)[0] + '.xlsx'
        try:
            # Define the table area ('x1,y1,x2,y2' in PDF points, origin at the
            # bottom-left) and two candidate sets of column x-coordinates
            table_area = '0,850,600,0'
            columns_options = [
                ['0,100,200,300,360,382.5,410,500,600'],
                ['0,100,200,300,360,400,500,600'],
            ]

            best_df = None

            for columns in columns_options:
                # Try extracting tables with the current set of coordinates
                tables = camelot.read_pdf(pdf_path, flavor='stream', pages='all', table_areas=[table_area],
                                          columns=columns)

                # Combine tables into a single DataFrame
                combined_df = pd.concat([table.df for table in tables], ignore_index=True)

                # Check if this extraction resulted in a valid table
                if not combined_df.empty:
                    best_df = combined_df
                    break  # Stop if we find a valid extraction

            if best_df is None:
                return jsonify({'error': 'No valid tables found with the provided coordinates'}), 404

            # Save the best DataFrame to an Excel file
            with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
                best_df.to_excel(writer, sheet_name='All_Data', index=False)

            return send_file(excel_path, as_attachment=True)

        except Exception as e:
            return jsonify({'error': str(e)}), 500
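
# Example request to /upload_invoice (hypothetical host/port):
#   curl -F "file=@invoice.pdf" http://localhost:2000/upload_invoice -o invoice.xlsx
# The column x-coordinates above are tuned to a specific invoice layout; the
# second set is only tried if the first yields an empty table.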

@app.route('/digital_trim_card_ck', methods=['POST'])
def extract_tables():
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400

    file = request.files['file']

    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    # Save the uploaded file to a temporary location
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
        file.save(temp_pdf.name)
        pdf_path = temp_pdf.name

    # Specify the output Excel file path
    excel_path = os.path.splitext(pdf_path)[0] + '.xlsx'

    try:
        # Open the PDF file and extract tables
        data = []
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                tables = page.extract_tables()
                if not tables:
                    text = page.extract_text()
                    if text:
                        rows = [line.split() for line in text.split('\n') if line.strip()]
                        for row in rows:
                            data.append(row)
                else:
                    for table in tables:
                        # Keep the header row and body rows together
                        data.extend(table)

        # Create a DataFrame from the collected data
        if data:
            combined_df = pd.DataFrame(data)

            # Clean the combined DataFrame
            combined_df = combined_df.apply(lambda x: x.map(remove_illegal_characters) if x.dtype == 'object' else x)

            # Create a new header row with numbers (starting from 0)
            number_header = [f'{i}' for i in range(len(combined_df.columns))]
            combined_df.columns = number_header  # Set the new header

            # Insert a new column at the beginning for row numbers
            combined_df.insert(0, 'Row Number', range(1, len(combined_df) + 1))  # Add row numbers starting from 1

            # Write the combined DataFrame to a single sheet in the Excel file
            combined_df.to_excel(excel_path, index=False, header=True)  # Ensure headers are included

        # Image extraction logic
        image_output_dir = os.path.join(os.path.dirname(pdf_path), 'images')
        os.makedirs(image_output_dir, exist_ok=True)

        # Define the regex pattern to search for
        pattern = r'Approved \d{2}/\d{2}/\d{4}, \d{1,2}:\d{2} [AP]M'
        min_width = 800  # Example minimum width
        min_height = 600  # Example minimum height
        best_image = None
        best_image_filename = ''
        best_image_dimensions = (0, 0)

        # Loop through each page using PyMuPDF for image extraction
        pdf_file = fitz.open(pdf_path)
        try:
            for page_num in range(len(pdf_file)):
                page = pdf_file[page_num]
                text = page.get_text()
                if re.search(pattern, text):
                    image_list = page.get_images(full=True)

                    for img_index, img in enumerate(image_list):
                        xref = img[0]
                        base_image = pdf_file.extract_image(xref)
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"]

                        try:
                            image = Image.open(io.BytesIO(image_bytes))
                            if image.width >= min_width and image.height >= min_height:
                                if not is_image_black(image):
                                    if (image.width * image.height) > (best_image_dimensions[0] * best_image_dimensions[1]):
                                        best_image = image
                                        best_image_filename = os.path.join(image_output_dir, f'best_image.{image_ext}')
                                        best_image_dimensions = (image.width, image.height)
                        except Exception as e:
                            print(f'Error processing image from page {page_num + 1}, image {img_index + 1}: {e}')
        finally:
            # Ensure the PDF file is properly closed
            pdf_file.close()

        # Save the best image if found
        if best_image:
            best_image.save(best_image_filename)
            print(f'Saved the best image: {best_image_filename}')
        else:
            print('No suitable images found.')

        # Add image to the Excel file in a new sheet named "Images"
        if best_image_filename:
            wb = load_workbook(excel_path)
            ws_image = wb.create_sheet("Images")
            img = OpenPyXLImage(best_image_filename)
            ws_image.add_image(img, 'A1')  # Place the image at cell A1
            wb.save(excel_path)

        # Encode Excel file to Base64
        with open(excel_path, "rb") as excel_file:
            excel_base64 = base64.b64encode(excel_file.read()).decode('utf-8')

        # Encode the best image to Base64
        image_base64 = None
        if best_image_filename:
            with open(best_image_filename, "rb") as image_file:
                image_base64 = base64.b64encode(image_file.read()).decode('utf-8')

        # Cleanup temporary files
        os.remove(pdf_path)
        os.remove(excel_path)
        if best_image_filename and os.path.exists(best_image_filename):
            os.remove(best_image_filename)

        # Send the Excel and image as Base64 in JSON
        return jsonify({
            "excel_base64": excel_base64,
            "image_base64": image_base64
        })

    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route('/tables_data', methods=['POST'])
def tables_data():
    if 'file' not in request.files:
        return Response("No file part", status=400)

    file = request.files['file']

    if file.filename == '':
        return Response("No selected file", status=400)

    # Save the uploaded file to a temporary location
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
        file.save(temp_pdf.name)
        pdf_path = temp_pdf.name

    try:
        # Extract the "Ship To" text
        ship_to_text = extract_text_in_rectangle(pdf_path)

        # Extract table data
        table_data = []
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                tables = page.extract_tables()
                if not tables:
                    text = page.extract_text()
                    if text:
                        rows = [line.split() for line in text.split('\n') if line.strip()]
                        for row in rows:
                            table_data.append('\t'.join(cell if cell else '-' for cell in row))
                else:
                    for table in tables:
                        for row in table:
                            table_data.append('\t'.join(cell if cell else '-' for cell in row))

        # Combine all data into a single raw response
        response_text = f"Ship To Text:\n{ship_to_text}\n\nExtracted Table Data:\n"
        response_text += '\n'.join(table_data)

        # Cleanup temporary file
        os.remove(pdf_path)

        return Response(response_text, mimetype="text/plain")

    except Exception as e:
        return Response(str(e), status=500)
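
# Example request to /tables_data (hypothetical host/port):
#   curl -F "file=@order.pdf" http://localhost:2000/tables_data
# Returns plain text: the "Ship To" address block, then tab-separated rows.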
        
@app.route('/upload-tech-pack', methods=['POST'])
def upload_tech_pack():
    # Check if the file is part of the request
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    
    file = request.files['file']
    # Check if a file was selected
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    
    if file:
        try:
            # Ensure the 'uploads' directory exists
            upload_dir = 'uploads'
            os.makedirs(upload_dir, exist_ok=True)

            # Save the uploaded file to the 'uploads' directory
            pdf_path = os.path.join(upload_dir, file.filename)
            file.save(pdf_path)

            # Initialize an empty DataFrame to collect all tables
            all_tables_df = pd.DataFrame()

            # Extract tables from the PDF
            with pdfplumber.open(pdf_path) as pdf:
                for page_number, page in enumerate(pdf.pages, start=1):
                    # Extract tables using PDFPlumber's auto table detection
                    tables = page.extract_tables()
                    
                    for idx, table in enumerate(tables):
                        # Convert the detected table to a DataFrame
                        df = pd.DataFrame(table)
                        # Concatenate the DataFrame into the master DataFrame
                        all_tables_df = pd.concat([all_tables_df, df], ignore_index=True)

            # Save the consolidated tables to an Excel file
            output_dir = 'outputs'
            os.makedirs(output_dir, exist_ok=True)
            excel_path = os.path.join(output_dir, 'output.xlsx')
            all_tables_df.to_excel(excel_path, index=False, header=False)

            # Send the Excel file as a response
            return send_file(excel_path, as_attachment=True)

        except Exception as e:
            # Handle exceptions and return an error response
            return jsonify({'error': str(e)}), 500
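
# Example request to /upload-tech-pack (hypothetical host/port):
#   curl -F "file=@tech_pack.pdf" http://localhost:2000/upload-tech-pack -o output.xlsx
# Note that the route writes to the fixed path outputs/output.xlsx, so
# concurrent requests will overwrite each other's results.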
              
@app.route('/outputs/images/<filename>')
def serve_image(filename):
    image_folder = os.path.join(app.root_path, 'outputs', 'images')
    return send_from_directory(image_folder, filename)
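# Example (hypothetical filename):
#   GET http://localhost:2000/outputs/images/image_1.png
# serves an image previously extracted by /get-tables-images below.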

@app.route('/get-tables-images', methods=['POST'])
def get_tables_images():
    # Check if the file is part of the request
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    # Check if a file was selected
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if file:
        try:
            # Ensure the 'uploads' directory exists
            upload_dir = 'uploads'
            os.makedirs(upload_dir, exist_ok=True)

            # Save the uploaded file to the 'uploads' directory
            pdf_path = os.path.join(upload_dir, file.filename)
            file.save(pdf_path)

            # Initialize an empty DataFrame to collect all tables
            all_tables_df = pd.DataFrame()

            # Extract tables from the PDF using pdfplumber
            with pdfplumber.open(pdf_path) as pdf:
                for page_number, page in enumerate(pdf.pages, start=1):
                    # Extract tables using PDFPlumber's auto table detection
                    tables = page.extract_tables()

                    for idx, table in enumerate(tables):
                        # Convert the detected table to a DataFrame
                        df = pd.DataFrame(table)
                        # Concatenate the DataFrame into the master DataFrame
                        all_tables_df = pd.concat([all_tables_df, df], ignore_index=True)

            # Set up the output directories (creating the images subfolder
            # also creates its parent)
            output_dir = 'outputs'
            output_images_dir = os.path.join(output_dir, 'images')
            os.makedirs(output_images_dir, exist_ok=True)

            # Extract the image from the first page using fitz (PyMuPDF)
            doc = fitz.open(pdf_path)
            first_page = doc[0]

            # Prepare to store extracted image paths
            image_paths = []

            for img_index, img in enumerate(first_page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_path = os.path.join(output_images_dir, f"image_{img_index + 1}.{image_ext}")

                # Save image to file
                with open(image_path, "wb") as img_file:
                    img_file.write(image_bytes)

                image_paths.append(f"outputs/images/image_{img_index + 1}.{image_ext}")

            excel_path = os.path.join(output_dir, 'output.xlsx')

            with pd.ExcelWriter(excel_path, engine='xlsxwriter') as writer:
                # Write tables to the first sheet
                all_tables_df.to_excel(writer, sheet_name='Tables', index=False, header=False)

                # Write image paths to the second sheet
                if image_paths:
                    image_df = pd.DataFrame({'Image Paths': image_paths})
                    image_df.to_excel(writer, sheet_name='Images', index=False)

                    # Embed images in the Excel file
                    workbook = writer.book
                    worksheet = writer.sheets['Images']
                    for idx, img_path in enumerate(image_paths):
                        worksheet.insert_image(f"B{idx + 2}", img_path)

            # Send the Excel file as a response
            return send_file(excel_path, as_attachment=True)

        except Exception as e:
            # Handle exceptions and return an error response
            return jsonify({'error': str(e)}), 500
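
# Example request to /get-tables-images (hypothetical host/port):
#   curl -F "file=@tech_pack.pdf" http://localhost:2000/get-tables-images -o output.xlsx
# The workbook contains a 'Tables' sheet and, when the first page has images,
# an 'Images' sheet with the pictures embedded via xlsxwriter.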
        
@app.route('/coleman-excel-api', methods=['POST'])
def sampling_excel_extraction():
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    try:
        upload_dir = 'uploads'
        os.makedirs(upload_dir, exist_ok=True)
        file_path = os.path.join(upload_dir, file.filename)
        file.save(file_path)

        # Load the workbook
        workbook = load_workbook(file_path, data_only=True)
        sheet_names = workbook.sheetnames
        sheet_data = {}

        for sheet_name in sheet_names:
            sheet = workbook[sheet_name]
            rows = list(sheet.iter_rows(values_only=True))

            if not rows:
                sheet_data[sheet_name] = []
                continue

            headers = [cell if cell is not None else f"{i}" for i, cell in enumerate(rows[0])]
            rows_data = []

            for row in rows[1:]:
                row_dict = {
                    headers[i]: str(row[i]) if i < len(row) and row[i] is not None else ""
                    for i in range(len(headers))
                }
                rows_data.append(row_dict)

            sheet_data[sheet_name] = rows_data

        return jsonify({
            'sheetCount': len(sheet_names),
            'sheets': sheet_data
        }), 200

    except Exception as e:
        return jsonify({'error': str(e)}), 500
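
# Example response from /coleman-excel-api (shape only; values illustrative):
#   {"sheetCount": 2,
#    "sheets": {"Sheet1": [{"Style": "ABC", "Qty": "10"}], "Sheet2": []}}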
    
@app.route('/lodestar-excel-api', methods=['POST'])
def sampling_excel_extraction_lodestar():
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    try:
        upload_dir = 'uploads'
        os.makedirs(upload_dir, exist_ok=True)
        file_path = os.path.join(upload_dir, file.filename)
        file.save(file_path)

        sheet_data = {}
        extension = os.path.splitext(file.filename)[1].lower()

        if extension == '.xlsx':
            workbook = load_workbook(file_path, data_only=True)
            for sheet_name in workbook.sheetnames:
                sheet = workbook[sheet_name]
                data = []
                for row in sheet.iter_rows(values_only=False):
                    if all(cell.value is None for cell in row):
                        continue  # skip empty rows
                    row_dict = {}
                    for i, cell in enumerate(row):
                        val = cell.value
                        if val is not None and cell.is_date:
                            val = cell.value.strftime('%d-%b-%y')
                        elif isinstance(val, float):
                            # Heuristic: a float whose number format looks like
                            # a date (e.g. 45599.0 formatted as 'dd-mmm-yy') is
                            # treated as an Excel serial date.
                            if cell.number_format.lower().startswith('d') or 'yy' in cell.number_format.lower():
                                try:
                                    val = from_excel(val).strftime('%d-%b-%y')
                                except Exception:
                                    pass
                        row_dict[f"{i+1}"] = str(val).strip() if val is not None else ""
                    data.append(row_dict)
                sheet_data[sheet_name] = data

        elif extension == '.xls':
            workbook = xlrd.open_workbook(file_path)
            for sheet_name in workbook.sheet_names():
                sheet = workbook.sheet_by_name(sheet_name)
                data = []
                for row_idx in range(sheet.nrows):
                    row = sheet.row(row_idx)
                    if all(cell.ctype == xlrd.XL_CELL_EMPTY for cell in row):
                        continue  # skip empty rows
                    row_dict = {}
                    for col_idx, cell in enumerate(row):
                        val = cell.value
                        if cell.ctype == xlrd.XL_CELL_DATE:
                            try:
                                val = xlrd.xldate_as_datetime(val, workbook.datemode).strftime('%d-%b-%y')
                            except Exception:
                                val = str(val)
                        # 'val not in (None, "")' keeps falsy-but-valid values such as 0
                        row_dict[f"{col_idx+1}"] = str(val).strip() if val not in (None, '') else ""
                    data.append(row_dict)
                sheet_data[sheet_name] = data

        else:
            return jsonify({'error': 'Unsupported file format'}), 400

        return jsonify({
            'sheetCount': len(sheet_data),
            'sheets': sheet_data
        }), 200

    except Exception as e:
        return jsonify({'error': str(e)}), 500
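
# Example request to /lodestar-excel-api (hypothetical host/port):
#   curl -F "file=@sampling.xls" http://localhost:2000/lodestar-excel-api
# Unlike /coleman-excel-api, columns are keyed by 1-based index ("1", "2", ...)
# rather than first-row headers, and dates are normalised to DD-Mon-YY.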

@app.route('/msapparel-api', methods=['POST'])
def msapparel_api():
    # Check for uploaded file
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if file:
        try:
            upload_dir = 'uploads'
            os.makedirs(upload_dir, exist_ok=True)
            pdf_path = os.path.join(upload_dir, file.filename)
            file.save(pdf_path)

            all_tables = []

            try:
                camelot_tables_lattice = camelot.read_pdf(pdf_path, pages='all', flavor='lattice', strip_text='\n')
                camelot_tables_stream = camelot.read_pdf(pdf_path, pages='all', flavor='stream', strip_text='\n')

                for table in camelot_tables_lattice:
                    df = table.df
                    all_tables.append(df)

                for table in camelot_tables_stream:
                    df = table.df
                    all_tables.append(df)

            except Exception as ce:
                print("⚠️ Camelot extraction error:", ce)

            try:
                with pdfplumber.open(pdf_path) as pdf:
                    for page_number, page in enumerate(pdf.pages, start=1):
                        tables = page.extract_tables()
                        for idx, table in enumerate(tables):
                            df = pd.DataFrame(table)
                            all_tables.append(df)
            except Exception as pe:
                print("⚠️ pdfplumber extraction error:", pe)

            output_dir = 'outputs'
            output_images_dir = os.path.join(output_dir, 'images')
            os.makedirs(output_dir, exist_ok=True)
            os.makedirs(output_images_dir, exist_ok=True)

            excel_path = os.path.join(output_dir, 'output_combined.xlsx')

            if all_tables:
                combined_df = pd.concat(all_tables, ignore_index=True)
                combined_df = combined_df.fillna("")

                with pd.ExcelWriter(excel_path, engine='xlsxwriter') as writer:
                    combined_df.to_excel(writer, sheet_name='All_Tables', index=False, header=False)

                    doc = fitz.open(pdf_path)
                    image_paths = []
                    for page_num in range(len(doc)):
                        page = doc[page_num]
                        for img_index, img in enumerate(page.get_images(full=True)):
                            xref = img[0]
                            base_image = doc.extract_image(xref)
                            image_bytes = base_image["image"]
                            image_ext = base_image["ext"]
                            image_path = os.path.join(
                                output_images_dir,
                                f"page{page_num+1}_image{img_index+1}.{image_ext}"
                            )
                            with open(image_path, "wb") as img_file:
                                img_file.write(image_bytes)
                            image_paths.append(image_path)

                    if image_paths:
                        image_df = pd.DataFrame({'Image Paths': image_paths})
                        image_df.to_excel(writer, sheet_name='Images', index=False)

                        workbook = writer.book
                        worksheet = writer.sheets['Images']
                        for idx, img_path in enumerate(image_paths):
                            try:
                                worksheet.insert_image(f"B{idx + 2}", img_path,
                                                       {'x_scale': 0.4, 'y_scale': 0.4})
                            except Exception as e:
                                print(f"⚠️ Could not embed image {img_path}: {e}")

            else:
                pd.DataFrame([["No tables detected in document"]]).to_excel(excel_path, index=False, header=False)

            return send_file(excel_path, as_attachment=True)

        except Exception as e:
            return jsonify({'error': str(e)}), 500
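
# Example request to /msapparel-api (hypothetical host/port):
#   curl -F "file=@order.pdf" http://localhost:2000/msapparel-api -o output_combined.xlsx
# Because Camelot's lattice and stream results and pdfplumber's tables are all
# concatenated, the 'All_Tables' sheet can contain the same table more than once.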

if __name__ == '__main__':
    os.makedirs('uploads', exist_ok=True)
    app.run(debug=True, host='165.22.220.143', port=2000)
    # app.run(debug=True, host='localhost', port=2000)
