"""Header normalisation for the exam-data Excel exports (startpakketten/checkheaders.py).

Both exports may carry junk rows above the real header row. The functions
below locate the row holding the expected header names, promote it to the
column labels and drop everything above it.
"""
import pandas as pd

# Columns retained from the predeliberation export; every other column is dropped.
_PREDELIB_COLUMNS = (
    'ID', 'Achternaam', 'Voornaam', 'E-mail', 'Loopbaan',
    'Drempelteller omschrijving', 'Programma status omschrijving',
    'OO Periode', 'OO Studiegidsnummer', 'OO Lange omschrijving',
    'OO Eenheden', 'OO Sessie', 'OO Credit (Y/N)', 'OO Periode credit',
    'OO Programma code', 'OO Programma korte omschr.', 'Totaal aantal SP',
    'Aantal SP vereist', 'Aantal SP zonder VZP', 'Adviesrapport code',
    'Waarschuwing', 'Lijsttype',
)


def _promote_header_row(df, required_headers):
    """Promote the first row containing every required header to column labels.

    Args:
        df: Raw dataframe whose real header row may sit below junk rows.
        required_headers: Header names that must all occur in the header row.

    Returns:
        ``(cleaned_df, skipped_rows)`` where ``cleaned_df`` holds only the rows
        below the header row (index reset) and ``skipped_rows`` is the number
        of rows that preceded the header row, or ``None`` when no row contains
        all required headers.
    """
    # Work with positions rather than index labels so that dataframes without
    # a default RangeIndex are sliced correctly by ``iloc``.
    for position, cells in enumerate(df.itertuples(index=False, name=None)):
        if all(header in cells for header in required_headers):
            cleaned = df.iloc[position + 1:].reset_index(drop=True)
            # A plain list keeps the old row label out of ``columns.name``.
            cleaned.columns = list(cells)
            return cleaned, position
    return None


def check_headers_predelibfile(df):
    """Normalise the headers of the predeliberation export.

    Args:
        df: Dataframe as read from the predeliberation Excel file.

    Returns:
        The dataframe unchanged when 'Achternaam' and 'Voornaam' are already
        column labels or when no header row can be found; otherwise a new
        dataframe with the header row promoted and only the known
        predeliberation columns that exist in the file.
    """
    if 'Achternaam' in df.columns and 'Voornaam' in df.columns:
        print("Headers found in first row - file already processed, returning unchanged")
        return df

    promoted = _promote_header_row(df, ('Achternaam', 'Voornaam'))
    if promoted is None:
        print("Headers 'Achternaam' and 'Voornaam' not found in the file")
        return df

    cleaned, skipped_rows = promoted
    # Keep only the known columns that are actually present in this file.
    kept_columns = [col for col in _PREDELIB_COLUMNS if col in cleaned.columns]
    cleaned = cleaned[kept_columns]

    print(f"Deleted {skipped_rows} rows, set proper headers, and kept {len(kept_columns)} columns")
    return cleaned


def check_headers_dashboard_inschrijvingenfile(df):
    """Normalise the headers of the dashboard_inschrijvingen export.

    Args:
        df: Dataframe as read from the dashboard Excel file.

    Returns:
        The dataframe unchanged when 'Naam' and 'Voornaam' are already column
        labels or when no header row can be found; otherwise a new dataframe
        with the header row promoted to column labels.
    """
    if 'Naam' in df.columns and 'Voornaam' in df.columns:
        print("Headers found in first row of dashboard_inschrijvingen - no need to search for header row")
        print("Headers were already correct in dashboard_file.")
        # Nothing to strip: return the data untouched instead of slicing it.
        return df

    promoted = _promote_header_row(df, ('Naam', 'Voornaam'))
    if promoted is None:
        print("Headers 'Naam' and 'Voornaam' not found in the file")
        return df

    cleaned, skipped_rows = promoted
    print(f"Deleted {skipped_rows} rows in dashboard_file, set proper headers")
    return cleaned


if __name__ == "__main__":
    # Read the Excel files
    df_predelib = pd.read_excel('db.xlsx')
    df_dashboard = pd.read_excel('dashboard_inschrijvingen.xlsx')

    # Process the dataframes
    processed_predelib_df = check_headers_predelibfile(df_predelib)
    processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)
"""Compare enrolled study points between the two exports (startpakketten/compare_sp.py)."""
import pandas as pd


def _normalise_id(value):
    """Return a canonical string key for an ID cell.

    Integral floats are rendered without the trailing ``.0`` so that an ID
    read as ``12345.0`` matches the same ID read as ``12345``; surrounding
    whitespace is stripped from the result.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


def _first_value_by_id(df, value_column):
    """Map each normalised, non-missing ID to the value of its first row."""
    lookup = {}
    for raw_id, value in zip(df['ID'], df[value_column]):
        if pd.isna(raw_id):
            continue
        # setdefault keeps the first occurrence when an ID is repeated.
        lookup.setdefault(_normalise_id(raw_id), value)
    return lookup


def _sp_equal(left, right):
    """Tell whether two SP cells denote the same amount.

    Two missing cells count as equal and a single missing cell as different.
    Cells that can both be read as numbers are compared numerically, so the
    text ``"60"`` equals the number ``60``.
    """
    left_missing, right_missing = pd.isna(left), pd.isna(right)
    if left_missing or right_missing:
        return bool(left_missing and right_missing)
    try:
        return float(left) == float(right)
    except (TypeError, ValueError):
        return bool(left == right)


def compare_sp_values(predelib_df, dashboard_df):
    """Compare 'Totaal aantal SP' with 'Ingeschr. SP (intern)' for shared IDs.

    Args:
        predelib_df (pandas.DataFrame): Predeliberation data with 'ID' and
            'Totaal aantal SP' columns.
        dashboard_df (pandas.DataFrame): Dashboard data with 'ID' and
            'Ingeschr. SP (intern)' columns.

    Returns:
        list: One dict per mismatch with keys 'ID', 'Predelib_SP' and
        'Dashboard_SP', sorted by ID. The list is empty when everything
        matches, when no IDs are shared, or when a required column is missing.
    """
    required_columns = (
        (predelib_df, 'ID', 'predelib'),
        (dashboard_df, 'ID', 'dashboard'),
        (predelib_df, 'Totaal aantal SP', 'predelib'),
        (dashboard_df, 'Ingeschr. SP (intern)', 'dashboard'),
    )
    for frame, column, label in required_columns:
        if column not in frame.columns:
            print(f"Warning: '{column}' column not found in {label} dataframe")
            return []

    # Diagnostic output for the ID columns
    print(f"Predelib ID column type: {predelib_df['ID'].dtype}")
    print(f"Dashboard ID column type: {dashboard_df['ID'].dtype}")
    print(f"Sample predelib IDs: {list(predelib_df['ID'].head())}")
    print(f"Sample dashboard IDs: {list(dashboard_df['ID'].head())}")

    # One pass per dataframe instead of re-filtering the frames for every ID.
    predelib_sp_by_id = _first_value_by_id(predelib_df, 'Totaal aantal SP')
    dashboard_sp_by_id = _first_value_by_id(dashboard_df, 'Ingeschr. SP (intern)')

    matching_ids = predelib_sp_by_id.keys() & dashboard_sp_by_id.keys()
    print(f"Found {len(matching_ids)} matching IDs between the two dataframes")

    if not matching_ids:
        print("No matching IDs found between the dataframes")
        print(f"Total predelib IDs: {len(predelib_sp_by_id)}")
        print(f"Total dashboard IDs: {len(dashboard_sp_by_id)}")
        return []

    # Sorted so that the report order is identical on every run.
    mismatches = [
        {
            'ID': id_val,
            'Predelib_SP': predelib_sp_by_id[id_val],
            'Dashboard_SP': dashboard_sp_by_id[id_val],
        }
        for id_val in sorted(matching_ids)
        if not _sp_equal(predelib_sp_by_id[id_val], dashboard_sp_by_id[id_val])
    ]

    if not mismatches:
        print("All SP values match between the two dataframes!")
    else:
        print(f"Found {len(mismatches)} mismatches:")
        for mismatch in mismatches:
            print(f"  ID {mismatch['ID']}: Predelib={mismatch['Predelib_SP']}, Dashboard={mismatch['Dashboard_SP']}")

    return mismatches


if __name__ == "__main__":
    # Example usage - can be used for testing
    from checkheaders import check_headers_predelibfile, check_headers_dashboard_inschrijvingenfile

    # Read the Excel files
    df_predelib = pd.read_excel('db.xlsx')
    df_dashboard = pd.read_excel('dashboard_inschrijvingen.xlsx')

    # Process the dataframes
    processed_predelib_df = check_headers_predelibfile(df_predelib)
    processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)

    # Compare SP values between the two processed dataframes
    print("\nComparing SP values between predelib and dashboard files:")
    mismatches = compare_sp_values(processed_predelib_df, processed_dashboard_df)
+print(processed_dashboard_df) + +compare_sp_values(processed_predelib_df, processed_dashboard_df) +print("\nComparison complete.") diff --git a/startpakketten/todo.md b/startpakketten/todo.md new file mode 100644 index 0000000..0fe9883 --- /dev/null +++ b/startpakketten/todo.md @@ -0,0 +1,4 @@ +Extracurriculaire vakken komen niet uit de wizard. Aparte Excel voor nodig, nl. dashboard inschrijvingen. +Voor de check of ze extra keuzevakken opnemen en de vereiste dus hoger moet komen te staan: vgl. de kolommen 'Totaal aantal SP' en 'Aantal SP vereist'. + +Deze kolom moet ook worden gecheckt als ze eigenlijk minder opnemen; dus die moeten altijd aan elkaar gelijk zijn.