ows-master/random/script 2.py

132 lines
5.0 KiB
Python
Raw Permalink Normal View History

2024-11-18 10:59:21 +00:00
import pandas as pd
from openpyxl import load_workbook
from dateutil import parser
import re
def list_sheets(file):
try:
workbook = load_workbook(filename=file, read_only=True)
sheets = workbook.sheetnames
return sheets
except Exception as e:
print(e)
raise ValueError(f"Could not open the file '{file}'. Please check the file and try again.")
def dutch_date_parser(date_str):
# Remove Dutch day names
day_name_pattern = r'\b(maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)\b'
date_str = re.sub(day_name_pattern, '', date_str, flags=re.IGNORECASE).strip()
# Translate Dutch month names to English
month_translation = {
'januari': 'January', 'februari': 'February', 'maart': 'March',
'april': 'April', 'mei': 'May', 'juni': 'June', 'juli': 'July',
'augustus': 'August', 'september': 'September', 'oktober': 'October',
'november': 'November', 'december': 'December'
}
for dutch_month, english_month in month_translation.items():
date_str = re.sub(r'\b' + dutch_month + r'\b', english_month, date_str, flags=re.IGNORECASE)
# Try parsing the modified date string
try:
return parser.parse(date_str, dayfirst=True)
except ValueError:
return pd.NaT
def compare_roosters(base_file, comparison_file, output_file):
# Print the sheets available in both Excel files
base_sheets = list_sheets(base_file)
comparison_sheets = list_sheets(comparison_file)
print(f"Sheets in '{base_file}': {base_sheets}")
print(f"Sheets in '{comparison_file}': {comparison_sheets}")
# Function to load an Excel file with error handling
def load_excel(file):
try:
df = pd.read_excel(file, engine='openpyxl')
if df.empty:
raise ValueError(f"The file '{file}' has no sheets or is empty.")
return df
except Exception as e:
print(e)
raise ValueError(f"Could not load the file '{file}'. Please check the file and try again.")
# Load the Excel files
base_df = load_excel(base_file)
comparison_df = load_excel(comparison_file)
# Ensure the columns we need are present in both files
required_columns = ['Code examenrooster', 'Beginuur S+', 'Datum S+', 'Einduur S+']
for column in required_columns:
if column not in base_df.columns or column not in comparison_df.columns:
raise ValueError(f"Column '{column}' is missing from one of the files")
# Convert 'Datum S+' in comparison_df to the universal format
comparison_df['Datum S+'] = comparison_df['Datum S+'].apply(
lambda x: dutch_date_parser(x) if isinstance(x, str) else x
)
# Merge the dataframes on 'Code examenrooster' to compare the rows with matching codes
merged_df = base_df.merge(
comparison_df,
on='Code examenrooster',
suffixes=('_base', '_comp'),
how='outer', # Outer join to capture all differences
indicator=True # Indicator to show if the row was in one or both files
)
# Create an empty list to store rows with differences
differences = []
# Iterate over each row to find discrepancies
for _, row in merged_df.iterrows():
row_data = {}
# Only compare rows that exist in both files
if row['_merge'] == 'both':
differences_in_row = []
# Compare the columns
for column in ['Beginuur S+', 'Datum S+', 'Einduur S+']:
base_value = row.get(f"{column}_base", pd.NA)
comp_value = row.get(f"{column}_comp", pd.NA)
if pd.isna(base_value) and pd.isna(comp_value):
continue # Skip comparison if both are NaN
elif base_value != comp_value:
differences_in_row.append(f"{column} differs (Base: {base_value}, Comp: {comp_value})")
# If there are any differences in this row, add them to the differences list
if differences_in_row:
for col in required_columns:
row_data[col] = row.get(col, pd.NA)
row_data[f"{col}_comp"] = row.get(f"{col}_comp", pd.NA)
row_data['Difference'] = "; ".join(differences_in_row)
differences.append(row_data)
elif row['_merge'] == 'left_only':
differences.append({
'Code examenrooster': row['Code examenrooster'],
'Difference': "Row missing in comparison file"
})
elif row['_merge'] == 'right_only':
differences.append({
'Code examenrooster': row['Code examenrooster'],
'Difference': "Row missing in base file"
})
# Create a DataFrame from the differences list
differences_df = pd.DataFrame(differences)
# Save the differences to an Excel file
differences_df.to_excel(output_file, index=False)
print(f"Differences saved to {output_file}")
# Example usage:
compare_roosters('afgewerkte.xlsx', 'bages rooster voor s.xlsx', 'differences_output.xlsx')