From eac9bad1347798bd5840993e60dfbc05aebd7f7b Mon Sep 17 00:00:00 2001
From: bdaneels <brecht.daneels@hotmail.com>
Date: Tue, 29 Jul 2025 09:28:23 +0200
Subject: [PATCH] Add data processing scripts for exam data and comparison
 functionality

---
 startpakketten/checkheaders.py | 85 ++++++++++++++++++++++++++++++
 startpakketten/compare_sp.py   | 95 ++++++++++++++++++++++++++++++++++
 startpakketten/script.py       | 20 +++++++
 startpakketten/todo.md         |  4 ++
 4 files changed, 204 insertions(+)
 create mode 100644 startpakketten/checkheaders.py
 create mode 100644 startpakketten/compare_sp.py
 create mode 100644 startpakketten/todo.md

diff --git a/startpakketten/checkheaders.py b/startpakketten/checkheaders.py
new file mode 100644
index 0000000..b249dd4
--- /dev/null
+++ b/startpakketten/checkheaders.py
@@ -0,0 +1,85 @@
+import pandas as pd
+
+def check_headers_predelibfile(df):
+    # Check if the headers are already in the column names (first row)
+    if 'Achternaam' in df.columns and 'Voornaam' in df.columns:
+        print("Headers found in first row - file already processed, returning unchanged")
+        return df  # Return the dataframe unchanged
+    else:
+        # Find the row index where 'Achternaam' and 'Voornaam' appear as headers
+        header_row = None
+        for i, row in df.iterrows():
+            if 'Achternaam' in row.values and 'Voornaam' in row.values:
+                header_row = i
+                break
+
+    if header_row is not None:
+        # Delete all rows before the header row
+        df = df.iloc[header_row:].reset_index(drop=True)
+
+        # Set the first row as column headers
+        df.columns = df.iloc[0]
+        df = df.iloc[1:].reset_index(drop=True)
+
+        # Define the columns to keep
+        columns_to_keep = [
+            'ID', 'Achternaam', 'Voornaam', 'E-mail', 'Loopbaan',
+            'Drempelteller omschrijving', 'Programma status omschrijving',
+            'OO Periode', 'OO Studiegidsnummer', 'OO Lange omschrijving',
+            'OO Eenheden', 'OO Sessie', 'OO Credit (Y/N)', 'OO Periode credit',
+            'OO Programma code', 'OO Programma korte omschr.', 'Totaal aantal SP',
+            'Aantal SP vereist', 'Aantal SP zonder VZP', 'Adviesrapport code',
+            'Waarschuwing', 'Lijsttype'
+        ]
+
+        # Keep only the specified columns (only if they exist in the dataframe)
+        existing_columns = [col for col in columns_to_keep if col in df.columns]
+        df = df[existing_columns]
+
+        print(f"Deleted {header_row} rows, set proper headers, and kept {len(existing_columns)} columns")
+        return df
+    else:
+        print("Headers 'Achternaam' and 'Voornaam' not found in the file")
+        return df
+
+def check_headers_dashboard_inschrijvingenfile(df):
+     # Check if the headers are already in the column names (first row)
+    if 'Naam' in df.columns and 'Voornaam' in df.columns:
+        print("Headers found in first row  of dashboard_inschrijvingen - no need to search for header row")
+        header_row = -1  # Indicates headers are already set
+    else:
+        # Find the row index where 'Naam' and 'Voornaam' appear as headers
+        header_row = None
+        for i, row in df.iterrows():
+            if 'Naam' in row.values and 'Voornaam' in row.values:
+                header_row = i
+                break
+
+    if header_row is not None:
+        # Delete all rows before the header row
+        df = df.iloc[header_row:].reset_index(drop=True)
+
+        # Set the first row as column headers
+        df.columns = df.iloc[0]
+        df = df.iloc[1:].reset_index(drop=True)
+
+        if header_row is not None and header_row >= 0:
+            print(f"Deleted {header_row} rows in dashboard_file, set proper headers")
+        elif header_row == -1:
+            print(f"Headers were already correct in dashboard_file.")
+        
+        return df
+    else:
+        print("Headers 'Achternaam' and 'Voornaam' not found in the file")
+        return df
+
+
+if __name__ == "__main__":
+    # Read the Excel files
+    df_predelib = pd.read_excel('db.xlsx')
+    df_dashboard = pd.read_excel('dashboard_inschrijvingen.xlsx')
+    
+    # Process the dataframes
+    processed_predelib_df = check_headers_predelibfile(df_predelib)
+    processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)
+
diff --git a/startpakketten/compare_sp.py b/startpakketten/compare_sp.py
new file mode 100644
index 0000000..d125b15
--- /dev/null
+++ b/startpakketten/compare_sp.py
@@ -0,0 +1,95 @@
+import pandas as pd
+
+def compare_sp_values(predelib_df, dashboard_df):
+    """
+    Compare 'Totaal aantal SP' from predelib_df with 'Ingeschr. SP (intern)' from dashboard_df
+    for matching IDs between the two dataframes.
+    
+    Args:
+        predelib_df (pandas.DataFrame): Dataframe from predeliberation file with 'ID' and 'Totaal aantal SP' columns
+        dashboard_df (pandas.DataFrame): Dataframe from dashboard file with 'ID' and 'Ingeschr. SP (intern)' columns
+    
+    Returns:
+        list: List of dictionaries containing mismatches, or empty list if all match
+    """
+    if 'ID' not in predelib_df.columns:
+        print("Warning: 'ID' column not found in predelib dataframe")
+        return []
+    
+    if 'ID' not in dashboard_df.columns:
+        print("Warning: 'ID' column not found in dashboard dataframe")
+        return []
+    
+    if 'Totaal aantal SP' not in predelib_df.columns:
+        print("Warning: 'Totaal aantal SP' column not found in predelib dataframe")
+        return []
+    
+    if 'Ingeschr. SP (intern)' not in dashboard_df.columns:
+        print("Warning: 'Ingeschr. SP (intern)' column not found in dashboard dataframe")
+        return []
+    
+    # Find matching IDs
+    # First, let's debug the ID columns
+    print(f"Predelib ID column type: {predelib_df['ID'].dtype}")
+    print(f"Dashboard ID column type: {dashboard_df['ID'].dtype}")
+    print(f"Sample predelib IDs: {list(predelib_df['ID'].head())}")
+    print(f"Sample dashboard IDs: {list(dashboard_df['ID'].head())}")
+    
+    # Convert IDs to strings to ensure consistent comparison
+    predelib_ids = set(str(x) for x in predelib_df['ID'] if pd.notna(x))
+    dashboard_ids = set(str(x) for x in dashboard_df['ID'] if pd.notna(x))
+    
+    matching_ids = predelib_ids.intersection(dashboard_ids)
+    print(f"Found {len(matching_ids)} matching IDs between the two dataframes")
+    
+    if len(matching_ids) == 0:
+        print("No matching IDs found between the dataframes")
+        print(f"Total predelib IDs: {len(predelib_ids)}")
+        print(f"Total dashboard IDs: {len(dashboard_ids)}")
+        return []
+    
+    # Compare SP values for matching IDs
+    mismatches = []
+    for id_val in matching_ids:
+        # Convert back to original type for filtering (try both string and original)
+        predelib_matches = predelib_df[predelib_df['ID'].astype(str) == id_val]
+        dashboard_matches = dashboard_df[dashboard_df['ID'].astype(str) == id_val]
+        
+        if len(predelib_matches) == 0 or len(dashboard_matches) == 0:
+            continue
+            
+        predelib_sp = predelib_matches['Totaal aantal SP'].iloc[0]
+        dashboard_sp = dashboard_matches['Ingeschr. SP (intern)'].iloc[0]
+        
+        if predelib_sp != dashboard_sp:
+            mismatches.append({
+                'ID': id_val,
+                'Predelib_SP': predelib_sp,
+                'Dashboard_SP': dashboard_sp
+            })
+    
+    if len(mismatches) == 0:
+        print("All SP values match between the two dataframes!")
+    else:
+        print(f"Found {len(mismatches)} mismatches:")
+        for mismatch in mismatches:
+            print(f"  ID {mismatch['ID']}: Predelib={mismatch['Predelib_SP']}, Dashboard={mismatch['Dashboard_SP']}")
+    
+    return mismatches
+
+
+if __name__ == "__main__":
+    # Example usage - can be used for testing
+    from checkheaders import check_headers_predelibfile, check_headers_dashboard_inschrijvingenfile
+    
+    # Read the Excel files
+    df_predelib = pd.read_excel('db.xlsx')
+    df_dashboard = pd.read_excel('dashboard_inschrijvingen.xlsx')
+    
+    # Process the dataframes
+    processed_predelib_df = check_headers_predelibfile(df_predelib)
+    processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)
+    
+    # Compare SP values between the two processed dataframes
+    print("\nComparing SP values between predelib and dashboard files:")
+    mismatches = compare_sp_values(processed_predelib_df, processed_dashboard_df)
diff --git a/startpakketten/script.py b/startpakketten/script.py
index e69de29..628b480 100644
--- a/startpakketten/script.py
+++ b/startpakketten/script.py
@@ -0,0 +1,20 @@
+import pandas as pd
+
+from checkheaders import check_headers_dashboard_inschrijvingenfile, check_headers_predelibfile
+from compare_sp import compare_sp_values
+
+# Read the Excel file
+df_predelib = pd.read_excel('db.xlsx')
+df_dashboard = pd.read_excel('dashboard_inschrijvingen.xlsx')
+processed_predelib_df = check_headers_predelibfile(df_predelib)
+processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)
+
+
+# Further processing can be done with processed_predelib_df and processed_dashboard_df
+print("Processed Predelib DataFrame:")
+print(processed_predelib_df)
+print("\nProcessed Dashboard DataFrame:")
+print(processed_dashboard_df)
+
+compare_sp_values(processed_predelib_df, processed_dashboard_df)
+print("\nComparison complete.")
diff --git a/startpakketten/todo.md b/startpakketten/todo.md
new file mode 100644
index 0000000..0fe9883
--- /dev/null
+++ b/startpakketten/todo.md
@@ -0,0 +1,4 @@
+Extracurriculaire vakken komen niet uit de wizard; daarvoor is een aparte Excel nodig, nl. dashboard inschrijvingen.
+Voor de check of ze extra keuzevakken opnemen en de vereiste dus hoger moet komen te staan: vgl. de kolommen 'Totaal aantal SP' en 'Aantal SP vereist'.
+
+Deze kolom moet ook worden gecheckt als ze eigenlijk minder opnemen; dus die moeten altijd aan elkaar gelijk zijn.