ows-master/random/script 2.py

import pandas as pd
from openpyxl import load_workbook
from dateutil import parser
import re


def list_sheets(file):
    try:
        workbook = load_workbook(filename=file, read_only=True)
        sheets = workbook.sheetnames
        return sheets
    except Exception as e:
        print(e)
        raise ValueError(f"Could not open the file '{file}'. Please check the file and try again.")


def dutch_date_parser(date_str):
    # Remove Dutch day names
    day_name_pattern = r'\b(maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)\b'
    date_str = re.sub(day_name_pattern, '', date_str, flags=re.IGNORECASE).strip()

    # Translate Dutch month names to English
    month_translation = {
        'januari': 'January', 'februari': 'February', 'maart': 'March',
        'april': 'April', 'mei': 'May', 'juni': 'June', 'juli': 'July',
        'augustus': 'August', 'september': 'September', 'oktober': 'October',
        'november': 'November', 'december': 'December'
    }

    for dutch_month, english_month in month_translation.items():
        date_str = re.sub(r'\b' + dutch_month + r'\b', english_month, date_str, flags=re.IGNORECASE)

    # Try parsing the modified date string
    try:
        return parser.parse(date_str, dayfirst=True)
    except ValueError:
        return pd.NaT

def compare_roosters(base_file, comparison_file, output_file):
    # Print the sheets available in both Excel files
    base_sheets = list_sheets(base_file)
    comparison_sheets = list_sheets(comparison_file)
    print(f"Sheets in '{base_file}': {base_sheets}")
    print(f"Sheets in '{comparison_file}': {comparison_sheets}")

    # Function to load an Excel file with error handling
    def load_excel(file):
        try:
            df = pd.read_excel(file, engine='openpyxl')
            if df.empty:
                raise ValueError(f"The file '{file}' has no sheets or is empty.")
            return df
        except Exception as e:
            print(e)
            raise ValueError(f"Could not load the file '{file}'. Please check the file and try again.")

    # Load the Excel files
    base_df = load_excel(base_file)
    comparison_df = load_excel(comparison_file)

    # Ensure the columns we need are present in both files
    required_columns = ['Code examenrooster', 'Beginuur S+', 'Datum S+', 'Einduur S+']
    for column in required_columns:
        if column not in base_df.columns or column not in comparison_df.columns:
            raise ValueError(f"Column '{column}' is missing from one of the files")

     # Convert 'Datum S+' in comparison_df to the universal format
    comparison_df['Datum S+'] = comparison_df['Datum S+'].apply(
        lambda x: dutch_date_parser(x) if isinstance(x, str) else x
    )

    # Merge the dataframes on 'Code examenrooster' to compare the rows with matching codes
    merged_df = base_df.merge(
        comparison_df,
        on='Code examenrooster',
        suffixes=('_base', '_comp'),
        how='outer',  # Outer join to capture all differences
        indicator=True  # Indicator to show if the row was in one or both files
    )

    # Create an empty list to store rows with differences
    differences = []

    # Iterate over each row to find discrepancies
    for _, row in merged_df.iterrows():
        row_data = {}
        # Only compare rows that exist in both files
        if row['_merge'] == 'both':
            differences_in_row = []

            # Compare the columns
            for column in ['Beginuur S+', 'Datum S+', 'Einduur S+']:
                base_value = row.get(f"{column}_base", pd.NA)
                comp_value = row.get(f"{column}_comp", pd.NA)

                if pd.isna(base_value) and pd.isna(comp_value):
                    continue  # Skip comparison if both are NaN
                elif base_value != comp_value:
                    differences_in_row.append(f"{column} differs (Base: {base_value}, Comp: {comp_value})")

            # If there are any differences in this row, add them to the differences list
            if differences_in_row:
                for col in required_columns:
                    row_data[col] = row.get(col, pd.NA)
                    row_data[f"{col}_comp"] = row.get(f"{col}_comp", pd.NA)
                row_data['Difference'] = "; ".join(differences_in_row)
                differences.append(row_data)

        elif row['_merge'] == 'left_only':
            differences.append({
                'Code examenrooster': row['Code examenrooster'],
                'Difference': "Row missing in comparison file"
            })
        elif row['_merge'] == 'right_only':
            differences.append({
                'Code examenrooster': row['Code examenrooster'],
                'Difference': "Row missing in base file"
            })

    # Create a DataFrame from the differences list
    differences_df = pd.DataFrame(differences)

    # Save the differences to an Excel file
    differences_df.to_excel(output_file, index=False)

    print(f"Differences saved to {output_file}")


# Example usage:
compare_roosters('afgewerkte.xlsx', 'bages rooster voor s.xlsx', 'differences_output.xlsx')