Why my python code is not able to read both table’s column headers properly in excel file?

  Kiến thức lập trình

I have a python program that is supposed to read an Excel file and do some processing. Below is the whole code.

import pandas as pd
import string
import random
import logging
import io
import boto3
import base64


if len(logging.getLogger().handlers) > 0:
    # The Lambda environment pre-configures a handler logging to stderr.
    # If a handler is already configured,
    # `.basicConfig` does not execute. Thus we set the level directly.
    logging.getLogger().setLevel(logging.INFO)
else:
    logging.basicConfig(level=logging.INFO)


if len(logging.getLogger().handlers) > 0:
    # The Lambda environment pre-configures a handler logging to stderr.
    # If a handler is already configured,
    # `.basicConfig` does not execute. Thus we set the level directly.
    logging.getLogger().setLevel(logging.INFO)
else:
    logging.basicConfig(level=logging.INFO)


def read_data_between_headers(df, target_headers):
    repeat_indices = df[df.apply(lambda row: all(cell in row.values for cell in target_headers), axis=1)].index

    if len(repeat_indices) > 1:
        data_between_headers = df.iloc[repeat_indices[0] + 1:repeat_indices[-1]]
        data_between_headers = data_between_headers.dropna(how='all').iloc[:-1]
        data_between_headers = data_between_headers.dropna(subset=target_headers, how='all')

        data_after_last_occurrence = df.iloc[repeat_indices[-1] + 1:]
        data_after_last_occurrence = data_after_last_occurrence.dropna(how='all')
        data_after_last_occurrence = data_after_last_occurrence.dropna(subset=target_headers, how='all')

        logging.info("read data between headers")
        logging.info("Columns in data_between_headers: %s", data_between_headers.columns)
        print("******************", data_between_headers, data_after_last_occurrence)
        return data_between_headers, data_after_last_occurrence

    else:
        logging.info("no data between headers")
        return pd.DataFrame(), pd.DataFrame()


def random_alphanumeric(length, hyphen_interval=4):
    characters = string.ascii_letters + string.digits
    random_value = ''.join(random.choice(characters) for _ in range(length))
    return '-'.join(random_value[i:i + hyphen_interval] for i in range(0, len(random_value), hyphen_interval))

def process_excel(df, target_headers):
    resulting_data = read_data_between_headers(df, target_headers)
    data_between_headers, data_after_last_occurrence = resulting_data
    print(data_between_headers, data_after_last_occurrence)
    print("Before conversion:", data_between_headers['Value'])
    data_between_headers['Val'] = pd.to_numeric(data_between_headers['Val'], errors='coerce')
    
    print("After conversion:", data_between_headers['Val'])
    data_between_headers['Type'] = 'GLA'
    data_after_last_occurrence['Val'] = pd.to_numeric(data_after_last_occurrence['Val'], errors='coerce')
    data_after_last_occurrence['Type'] = 'GLA'
    logging.info("process excel worked")

    updated_rows = []

    for index, row in data_between_headers.iterrows():
        updated_rows.append(dict(row))
        if row['Value'] != 0:
            new_row = dict(row)
            new_row['Val'] = -row['Val']
            new_row['ID'] = random_alphanumeric(16, hyphen_interval=4)
            new_row['GLA'] = '2100'

            for col in data_between_headers.columns:
                if col not in ['ID', 'Comp', 'Type', 'Jou', 'Val', 'GLA']
                    new_row[col] = ''
            updated_rows.append(new_row)
            print(updated_rows)

    updated_rows.extend([{}, {}])
    updated_rows.append({key: key for key in data_between_headers.columns})

    for index, row in data_after_last_occurrence.iterrows():
        updated_rows.append(dict(row))
        if row['Value'] != 0:
            new_row = dict(row)
            new_row['Val'] = -row['Val']
            new_row['ID'] = random_alphanumeric(16, hyphen_interval=4)
            new_row['GLA'] = '2100'

            for col in data_after_last_occurrence.columns:
                if col not in ['ID', 'Comp', 'Type', 'Jou', 'Val', 'GLA']
                    new_row[col] = ''
            updated_rows.append(new_row)

    updated_df = pd.DataFrame(updated_rows, columns=data_between_headers.columns)
    logging.info("processed data successfully")

    return updated_df

def read_excel_from_s3(file_path):
    logging.info("Reading cheatsheet from S3.")
    try:
        bucket_name = 'kloo-approved-invoice-integration'
        object_key = file_path
        s3 = boto3.client('s3')
        logging.info(f"Bucket={bucket_name}, Key={object_key}")
        obj = s3.get_object(Bucket=bucket_name, Key=object_key)
        data = obj['Body'].read()
        print(data)
        df = pd.read_excel(io.BytesIO(data))
    except Exception as error:
        logging.error(f"Something went wrong with reading Excel file from S3: {file_path}")
        df = pd.DataFrame()  # Return an empty DataFrame in case of an error
    return df

def write_excel_with_specific_columns(df, output_file_path):
    logging.info(f"Writing DataFrame to {output_file_path} with specific columns.")
    try:
        with io.BytesIO() as output:
            with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                df.to_excel(writer, index=False)  # Set index=False to exclude the index column
            data = output.getvalue()

        s3 = boto3.resource('s3')
        s3.Bucket('kloo-approved-invoice-integration').put_object(Key='stage/test2.xlsx', Body=data)
        logging.info(f"DataFrame successfully written to {output_file_path}.")
    except Exception as error:
        logging.error(f"Something went wrong with writing DataFrame to Excel file: {output_file_path}")

def lambda_handler(event, context):
    try:
        query_string_parameters = event['queryStringParameters']
        xl_input_file = query_string_parameters.get('xl_input_file')
        # xl_input_file = event.get('xl_input_file') or os.environ.get("XL_INPUT_FILE")

        if xl_input_file:
            print(f"Excel file to process: {xl_input_file}")

            target_headers = ['ID', 'Comp', 'Type', 'Jou', 'Val', 'GLA']

            # Read the Excel file from S3
            excel_df = read_excel_from_s3(xl_input_file)
            # logging.info(excel_df)

            # result_df = process_excel(xl_input_file, target_headers)
            result_df = process_excel(excel_df, target_headers)
            # logging.info(result_df)

            # Specify the output file path
            # output_file_path = 'kloo-approved-invoice-integration/stage/test2.xlsx'
            encoded_data = base64.b64encode(result_df, validate=True)
            logging.info(encoded_data)

            # Write the processed DataFrame to a new Excel file with specific columns
            write_excel_with_specific_columns(result_df, 'stage/test2.xlsx')

            # # Write DataFrame to a new Excel file with specific columns
            # write_excel_with_specific_columns(excel_df, 'kloo-approved-invoice-integration/stage/test2.xlsx', target_headers)

            return {
                'statusCode': 200,
                'body': 'Process completed successfully',
                'result_df': encoded_data # Convert DataFrame to a JSON-serializable format
            }
        else:
            print("No Excel file provided.")
            return {
                'statusCode': 400,
                'body': 'No Excel file provided'
            }

    except Exception as e:
        print(f"Error: {str(e)}")
        return {
            'statusCode': 500,
            'body': f"Error: {str(e)}"
        }

Error: ['ID', 'Comp', 'Type', 'Jou', 'Val', 'GLA']

The expected behavior of the code is to read headers and data of both the tables in an Excel file and process the file. The code is also supposed to ignore blank rows for both the tables in the Excel file.

Below is the error.

x1bx0bxf8x0exb9x00x00x00xc9x00x00x00x18x00x00x00x00x00x00x00x00x00x00x00x00x00sLx00x00customXml/itemProps3.xmlPKx01x02-x00x14x00x06x00x08x00x00x00!x00t?9zxc2x00x00x00(x01x00x00x1ex00x00x00x00x00x00x00x00x00x00x00x00x00x8aMx00x00customXml/_rels/item1.xml.relsPKx01x02-x00x14x00x06x00x08x00x00x00!x00\x96'"xc3x00x00x00(x01x00x00x1ex00x00x00x00x00x00x00x00x00x00x00x00x00x90Ox00x00customXml/_rels/item2.xml.relsPKx01x02-x00x14x00x06x00x08x00x00x00!x00{xf3x02xa3xc3x00x00x00(x01x00x00x1ex00x00x00x00x00x00x00x00x00x00x00x00x00x97Qx00x00customXml/_rels/item3.xml.relsPKx05x06x00x00x00x00x17x00x17x00x0fx06x00x00x9eSx00x00x00x00'


Error: ['ID', 'Comp', 'Type', 'Jou', 'Val', 'GLA']
1. If I understand correctly, the code is considering the column names as data, not headers.
2. The Excel file has two tables and both tables have the same sets of column names(headers).

Please suggest.

LEAVE A COMMENT