import pandas as pd from openpyxl import load_workbook from dateutil import parser import re def list_sheets(file): try: workbook = load_workbook(filename=file, read_only=True) sheets = workbook.sheetnames return sheets except Exception as e: print(e) raise ValueError(f"Could not open the file '{file}'. Please check the file and try again.") def dutch_date_parser(date_str): # Remove Dutch day names day_name_pattern = r'\b(maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)\b' date_str = re.sub(day_name_pattern, '', date_str, flags=re.IGNORECASE).strip() # Translate Dutch month names to English month_translation = { 'januari': 'January', 'februari': 'February', 'maart': 'March', 'april': 'April', 'mei': 'May', 'juni': 'June', 'juli': 'July', 'augustus': 'August', 'september': 'September', 'oktober': 'October', 'november': 'November', 'december': 'December' } for dutch_month, english_month in month_translation.items(): date_str = re.sub(r'\b' + dutch_month + r'\b', english_month, date_str, flags=re.IGNORECASE) # Try parsing the modified date string try: return parser.parse(date_str, dayfirst=True) except ValueError: return pd.NaT def compare_roosters(base_file, comparison_file, output_file): # Print the sheets available in both Excel files base_sheets = list_sheets(base_file) comparison_sheets = list_sheets(comparison_file) print(f"Sheets in '{base_file}': {base_sheets}") print(f"Sheets in '{comparison_file}': {comparison_sheets}") # Function to load an Excel file with error handling def load_excel(file): try: df = pd.read_excel(file, engine='openpyxl') if df.empty: raise ValueError(f"The file '{file}' has no sheets or is empty.") return df except Exception as e: print(e) raise ValueError(f"Could not load the file '{file}'. Please check the file and try again.") # Load the Excel files base_df = load_excel(base_file) comparison_df = load_excel(comparison_file) # Ensure the columns we need are present in both files required_columns = ['Code examenrooster', 'Beginuur S+', 'Datum S+', 'Einduur S+'] for column in required_columns: if column not in base_df.columns or column not in comparison_df.columns: raise ValueError(f"Column '{column}' is missing from one of the files") # Convert 'Datum S+' in comparison_df to the universal format comparison_df['Datum S+'] = comparison_df['Datum S+'].apply( lambda x: dutch_date_parser(x) if isinstance(x, str) else x ) # Merge the dataframes on 'Code examenrooster' to compare the rows with matching codes merged_df = base_df.merge( comparison_df, on='Code examenrooster', suffixes=('_base', '_comp'), how='outer', # Outer join to capture all differences indicator=True # Indicator to show if the row was in one or both files ) # Create an empty list to store rows with differences differences = [] # Iterate over each row to find discrepancies for _, row in merged_df.iterrows(): row_data = {} # Only compare rows that exist in both files if row['_merge'] == 'both': differences_in_row = [] # Compare the columns for column in ['Beginuur S+', 'Datum S+', 'Einduur S+']: base_value = row.get(f"{column}_base", pd.NA) comp_value = row.get(f"{column}_comp", pd.NA) if pd.isna(base_value) and pd.isna(comp_value): continue # Skip comparison if both are NaN elif base_value != comp_value: differences_in_row.append(f"{column} differs (Base: {base_value}, Comp: {comp_value})") # If there are any differences in this row, add them to the differences list if differences_in_row: for col in required_columns: row_data[col] = row.get(col, pd.NA) row_data[f"{col}_comp"] = row.get(f"{col}_comp", pd.NA) row_data['Difference'] = "; ".join(differences_in_row) differences.append(row_data) elif row['_merge'] == 'left_only': differences.append({ 'Code examenrooster': row['Code examenrooster'], 'Difference': "Row missing in comparison file" }) elif row['_merge'] == 'right_only': differences.append({ 'Code examenrooster': row['Code examenrooster'], 'Difference': "Row missing in base file" }) # Create a DataFrame from the differences list differences_df = pd.DataFrame(differences) # Save the differences to an Excel file differences_df.to_excel(output_file, index=False) print(f"Differences saved to {output_file}") # Example usage: compare_roosters('afgewerkte.xlsx', 'bages rooster voor s.xlsx', 'differences_output.xlsx')