ows-master/random/script 2.py

import pandas as pd
from openpyxl import load_workbook
from dateutil import parser
import re


def list_sheets(file):
    try:
        workbook = load_workbook(filename=file, read_only=True)
        sheets = workbook.sheetnames
        return sheets
    except Exception as e:
        print(e)
        raise ValueError(f"Could not open the file '{file}'. Please check the file and try again.")


def dutch_date_parser(date_str):
    # Remove Dutch day names
    day_name_pattern = r'\b(maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)\b'
    date_str = re.sub(day_name_pattern, '', date_str, flags=re.IGNORECASE).strip()

    # Translate Dutch month names to English
    month_translation = {
        'januari': 'January', 'februari': 'February', 'maart': 'March',
        'april': 'April', 'mei': 'May', 'juni': 'June', 'juli': 'July',
        'augustus': 'August', 'september': 'September', 'oktober': 'October',
        'november': 'November', 'december': 'December'
    }

    for dutch_month, english_month in month_translation.items():
        date_str = re.sub(r'\b' + dutch_month + r'\b', english_month, date_str, flags=re.IGNORECASE)

    # Try parsing the modified date string
    try:
        return parser.parse(date_str, dayfirst=True)
    except ValueError:
        return pd.NaT

def compare_roosters(base_file, comparison_file, output_file):
    # Print the sheets available in both Excel files
    base_sheets = list_sheets(base_file)
    comparison_sheets = list_sheets(comparison_file)
    print(f"Sheets in '{base_file}': {base_sheets}")
    print(f"Sheets in '{comparison_file}': {comparison_sheets}")

    # Function to load an Excel file with error handling
    def load_excel(file):
        try:
            df = pd.read_excel(file, engine='openpyxl')
            if df.empty:
                raise ValueError(f"The file '{file}' has no sheets or is empty.")
            return df
        except Exception as e:
            print(e)
            raise ValueError(f"Could not load the file '{file}'. Please check the file and try again.")

    # Load the Excel files
    base_df = load_excel(base_file)
    comparison_df = load_excel(comparison_file)

    # Ensure the columns we need are present in both files
    required_columns = ['Code examenrooster', 'Beginuur S+', 'Datum S+', 'Einduur S+']
    for column in required_columns:
        if column not in base_df.columns or column not in comparison_df.columns:
            raise ValueError(f"Column '{column}' is missing from one of the files")

     # Convert 'Datum S+' in comparison_df to the universal format
    comparison_df['Datum S+'] = comparison_df['Datum S+'].apply(
        lambda x: dutch_date_parser(x) if isinstance(x, str) else x
    )

    # Merge the dataframes on 'Code examenrooster' to compare the rows with matching codes
    merged_df = base_df.merge(
        comparison_df,
        on='Code examenrooster',
        suffixes=('_base', '_comp'),
        how='outer',  # Outer join to capture all differences
        indicator=True  # Indicator to show if the row was in one or both files
    )

    # Create an empty list to store rows with differences
    differences = []

    # Iterate over each row to find discrepancies
    for _, row in merged_df.iterrows():
        row_data = {}
        # Only compare rows that exist in both files
        if row['_merge'] == 'both':
            differences_in_row = []

            # Compare the columns
            for column in ['Beginuur S+', 'Datum S+', 'Einduur S+']:
                base_value = row.get(f"{column}_base", pd.NA)
                comp_value = row.get(f"{column}_comp", pd.NA)

                if pd.isna(base_value) and pd.isna(comp_value):
                    continue  # Skip comparison if both are NaN
                elif base_value != comp_value:
                    differences_in_row.append(f"{column} differs (Base: {base_value}, Comp: {comp_value})")

            # If there are any differences in this row, add them to the differences list
            if differences_in_row:
                for col in required_columns:
                    row_data[col] = row.get(col, pd.NA)
                    row_data[f"{col}_comp"] = row.get(f"{col}_comp", pd.NA)
                row_data['Difference'] = "; ".join(differences_in_row)
                differences.append(row_data)

        elif row['_merge'] == 'left_only':
            differences.append({
                'Code examenrooster': row['Code examenrooster'],
                'Difference': "Row missing in comparison file"
            })
        elif row['_merge'] == 'right_only':
            differences.append({
                'Code examenrooster': row['Code examenrooster'],
                'Difference': "Row missing in base file"
            })

    # Create a DataFrame from the differences list
    differences_df = pd.DataFrame(differences)

    # Save the differences to an Excel file
    differences_df.to_excel(output_file, index=False)

    print(f"Differences saved to {output_file}")


# Example usage:
compare_roosters('afgewerkte.xlsx', 'bages rooster voor s.xlsx', 'differences_output.xlsx')
first commit 2024-11-18 10:59:21 +00:00			`import pandas as pd`
			`from openpyxl import load_workbook`
			`from dateutil import parser`
			`import re`



			`def list_sheets(file):`
			`try:`
			`workbook = load_workbook(filename=file, read_only=True)`
			`sheets = workbook.sheetnames`
			`return sheets`
			`except Exception as e:`
			`print(e)`
			`raise ValueError(f"Could not open the file '{file}'. Please check the file and try again.")`


			`def dutch_date_parser(date_str):`
			`# Remove Dutch day names`
			`day_name_pattern = r'\b(maandag\|dinsdag\|woensdag\|donderdag\|vrijdag\|zaterdag\|zondag)\b'`
			`date_str = re.sub(day_name_pattern, '', date_str, flags=re.IGNORECASE).strip()`

			`# Translate Dutch month names to English`
			`month_translation = {`
			`'januari': 'January', 'februari': 'February', 'maart': 'March',`
			`'april': 'April', 'mei': 'May', 'juni': 'June', 'juli': 'July',`
			`'augustus': 'August', 'september': 'September', 'oktober': 'October',`
			`'november': 'November', 'december': 'December'`
			`}`

			`for dutch_month, english_month in month_translation.items():`
			`date_str = re.sub(r'\b' + dutch_month + r'\b', english_month, date_str, flags=re.IGNORECASE)`

			`# Try parsing the modified date string`
			`try:`
			`return parser.parse(date_str, dayfirst=True)`
			`except ValueError:`
			`return pd.NaT`

			`def compare_roosters(base_file, comparison_file, output_file):`
			`# Print the sheets available in both Excel files`
			`base_sheets = list_sheets(base_file)`
			`comparison_sheets = list_sheets(comparison_file)`
			`print(f"Sheets in '{base_file}': {base_sheets}")`
			`print(f"Sheets in '{comparison_file}': {comparison_sheets}")`

			`# Function to load an Excel file with error handling`
			`def load_excel(file):`
			`try:`
			`df = pd.read_excel(file, engine='openpyxl')`
			`if df.empty:`
			`raise ValueError(f"The file '{file}' has no sheets or is empty.")`
			`return df`
			`except Exception as e:`
			`print(e)`
			`raise ValueError(f"Could not load the file '{file}'. Please check the file and try again.")`

			`# Load the Excel files`
			`base_df = load_excel(base_file)`
			`comparison_df = load_excel(comparison_file)`

			`# Ensure the columns we need are present in both files`
			`required_columns = ['Code examenrooster', 'Beginuur S+', 'Datum S+', 'Einduur S+']`
			`for column in required_columns:`
			`if column not in base_df.columns or column not in comparison_df.columns:`
			`raise ValueError(f"Column '{column}' is missing from one of the files")`

			`# Convert 'Datum S+' in comparison_df to the universal format`
			`comparison_df['Datum S+'] = comparison_df['Datum S+'].apply(`
			`lambda x: dutch_date_parser(x) if isinstance(x, str) else x`
			`)`

			`# Merge the dataframes on 'Code examenrooster' to compare the rows with matching codes`
			`merged_df = base_df.merge(`
			`comparison_df,`
			`on='Code examenrooster',`
			`suffixes=('_base', '_comp'),`
			`how='outer', # Outer join to capture all differences`
			`indicator=True # Indicator to show if the row was in one or both files`
			`)`

			`# Create an empty list to store rows with differences`
			`differences = []`

			`# Iterate over each row to find discrepancies`
			`for _, row in merged_df.iterrows():`
			`row_data = {}`
			`# Only compare rows that exist in both files`
			`if row['_merge'] == 'both':`
			`differences_in_row = []`

			`# Compare the columns`
			`for column in ['Beginuur S+', 'Datum S+', 'Einduur S+']:`
			`base_value = row.get(f"{column}_base", pd.NA)`
			`comp_value = row.get(f"{column}_comp", pd.NA)`

			`if pd.isna(base_value) and pd.isna(comp_value):`
			`continue # Skip comparison if both are NaN`
			`elif base_value != comp_value:`
			`differences_in_row.append(f"{column} differs (Base: {base_value}, Comp: {comp_value})")`

			`# If there are any differences in this row, add them to the differences list`
			`if differences_in_row:`
			`for col in required_columns:`
			`row_data[col] = row.get(col, pd.NA)`
			`row_data[f"{col}_comp"] = row.get(f"{col}_comp", pd.NA)`
			`row_data['Difference'] = "; ".join(differences_in_row)`
			`differences.append(row_data)`

			`elif row['_merge'] == 'left_only':`
			`differences.append({`
			`'Code examenrooster': row['Code examenrooster'],`
			`'Difference': "Row missing in comparison file"`
			`})`
			`elif row['_merge'] == 'right_only':`
			`differences.append({`
			`'Code examenrooster': row['Code examenrooster'],`
			`'Difference': "Row missing in base file"`
			`})`

			`# Create a DataFrame from the differences list`
			`differences_df = pd.DataFrame(differences)`

			`# Save the differences to an Excel file`
			`differences_df.to_excel(output_file, index=False)`

			`print(f"Differences saved to {output_file}")`


			`# Example usage:`
			`compare_roosters('afgewerkte.xlsx', 'bages rooster voor s.xlsx', 'differences_output.xlsx')`