132 lines
5.0 KiB
Python
132 lines
5.0 KiB
Python
import pandas as pd
|
|
from openpyxl import load_workbook
|
|
from dateutil import parser
|
|
import re
|
|
|
|
|
|
|
|
def list_sheets(file):
|
|
try:
|
|
workbook = load_workbook(filename=file, read_only=True)
|
|
sheets = workbook.sheetnames
|
|
return sheets
|
|
except Exception as e:
|
|
print(e)
|
|
raise ValueError(f"Could not open the file '{file}'. Please check the file and try again.")
|
|
|
|
|
|
def dutch_date_parser(date_str):
|
|
# Remove Dutch day names
|
|
day_name_pattern = r'\b(maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)\b'
|
|
date_str = re.sub(day_name_pattern, '', date_str, flags=re.IGNORECASE).strip()
|
|
|
|
# Translate Dutch month names to English
|
|
month_translation = {
|
|
'januari': 'January', 'februari': 'February', 'maart': 'March',
|
|
'april': 'April', 'mei': 'May', 'juni': 'June', 'juli': 'July',
|
|
'augustus': 'August', 'september': 'September', 'oktober': 'October',
|
|
'november': 'November', 'december': 'December'
|
|
}
|
|
|
|
for dutch_month, english_month in month_translation.items():
|
|
date_str = re.sub(r'\b' + dutch_month + r'\b', english_month, date_str, flags=re.IGNORECASE)
|
|
|
|
# Try parsing the modified date string
|
|
try:
|
|
return parser.parse(date_str, dayfirst=True)
|
|
except ValueError:
|
|
return pd.NaT
|
|
|
|
def compare_roosters(base_file, comparison_file, output_file):
|
|
# Print the sheets available in both Excel files
|
|
base_sheets = list_sheets(base_file)
|
|
comparison_sheets = list_sheets(comparison_file)
|
|
print(f"Sheets in '{base_file}': {base_sheets}")
|
|
print(f"Sheets in '{comparison_file}': {comparison_sheets}")
|
|
|
|
# Function to load an Excel file with error handling
|
|
def load_excel(file):
|
|
try:
|
|
df = pd.read_excel(file, engine='openpyxl')
|
|
if df.empty:
|
|
raise ValueError(f"The file '{file}' has no sheets or is empty.")
|
|
return df
|
|
except Exception as e:
|
|
print(e)
|
|
raise ValueError(f"Could not load the file '{file}'. Please check the file and try again.")
|
|
|
|
# Load the Excel files
|
|
base_df = load_excel(base_file)
|
|
comparison_df = load_excel(comparison_file)
|
|
|
|
# Ensure the columns we need are present in both files
|
|
required_columns = ['Code examenrooster', 'Beginuur S+', 'Datum S+', 'Einduur S+']
|
|
for column in required_columns:
|
|
if column not in base_df.columns or column not in comparison_df.columns:
|
|
raise ValueError(f"Column '{column}' is missing from one of the files")
|
|
|
|
# Convert 'Datum S+' in comparison_df to the universal format
|
|
comparison_df['Datum S+'] = comparison_df['Datum S+'].apply(
|
|
lambda x: dutch_date_parser(x) if isinstance(x, str) else x
|
|
)
|
|
|
|
# Merge the dataframes on 'Code examenrooster' to compare the rows with matching codes
|
|
merged_df = base_df.merge(
|
|
comparison_df,
|
|
on='Code examenrooster',
|
|
suffixes=('_base', '_comp'),
|
|
how='outer', # Outer join to capture all differences
|
|
indicator=True # Indicator to show if the row was in one or both files
|
|
)
|
|
|
|
# Create an empty list to store rows with differences
|
|
differences = []
|
|
|
|
# Iterate over each row to find discrepancies
|
|
for _, row in merged_df.iterrows():
|
|
row_data = {}
|
|
# Only compare rows that exist in both files
|
|
if row['_merge'] == 'both':
|
|
differences_in_row = []
|
|
|
|
# Compare the columns
|
|
for column in ['Beginuur S+', 'Datum S+', 'Einduur S+']:
|
|
base_value = row.get(f"{column}_base", pd.NA)
|
|
comp_value = row.get(f"{column}_comp", pd.NA)
|
|
|
|
if pd.isna(base_value) and pd.isna(comp_value):
|
|
continue # Skip comparison if both are NaN
|
|
elif base_value != comp_value:
|
|
differences_in_row.append(f"{column} differs (Base: {base_value}, Comp: {comp_value})")
|
|
|
|
# If there are any differences in this row, add them to the differences list
|
|
if differences_in_row:
|
|
for col in required_columns:
|
|
row_data[col] = row.get(col, pd.NA)
|
|
row_data[f"{col}_comp"] = row.get(f"{col}_comp", pd.NA)
|
|
row_data['Difference'] = "; ".join(differences_in_row)
|
|
differences.append(row_data)
|
|
|
|
elif row['_merge'] == 'left_only':
|
|
differences.append({
|
|
'Code examenrooster': row['Code examenrooster'],
|
|
'Difference': "Row missing in comparison file"
|
|
})
|
|
elif row['_merge'] == 'right_only':
|
|
differences.append({
|
|
'Code examenrooster': row['Code examenrooster'],
|
|
'Difference': "Row missing in base file"
|
|
})
|
|
|
|
# Create a DataFrame from the differences list
|
|
differences_df = pd.DataFrame(differences)
|
|
|
|
# Save the differences to an Excel file
|
|
differences_df.to_excel(output_file, index=False)
|
|
|
|
print(f"Differences saved to {output_file}")
|
|
|
|
|
|
# Example usage:
|
|
compare_roosters('afgewerkte.xlsx', 'bages rooster voor s.xlsx', 'differences_output.xlsx')
|