Refactor code structure and remove redundant code blocks

2025-10-21 14:20:04 +02:00
parent bcf8f3acae
commit 468df81386
10 changed files with 2054 additions and 19 deletions
--- a/startpakketten/process_predelib_file.py
+++ b/startpakketten/process_predelib_file.py
@@ -181,22 +181,49 @@ def check_students_with_mismatching_SP_values(predelib_df: pd.DataFrame) -> List
        
        logger.info("All required columns found in dataframe")

-        # Check for mismatching SP values
-        mismatching_students = []
-        for index, row in predelib_df.iterrows():
-            if row['Totaal aantal SP'] != row['Aantal SP vereist']:
-                mismatching_students.append({
-                    'ID': row['ID'],
-                    'Achternaam': row['Achternaam'],
-                    'Voornaam': row['Voornaam'],
-                    'E-mail': row['E-mail'],
-                    'Totaal_aantal_SP': row['Totaal aantal SP'],
-                    'Aantal_SP_vereist': row['Aantal SP vereist'],
-                    'Waarschuwing': row['Waarschuwing'],
-                    'Adviesrapport_code': row['Adviesrapport code']
-                })
+        # Use vectorized comparison to find rows where the SP values differ
+        sp_col = predelib_df['Totaal aantal SP']
+        req_col = predelib_df['Aantal SP vereist']

-        logger.info(f"Found {len(mismatching_students)} students with mismatching SP values")
+        # Element-wise inequality; NaN != NaN evaluates to True, so rows with a NaN SP value are also reported as mismatches, which is acceptable
+        mask = sp_col != req_col
+        mismatches_df = predelib_df[mask].copy()
+
+        logger.info(f"Found {len(mismatches_df)} raw rows with mismatching SP values")
+
+        if mismatches_df.empty:
+            logger.info("No students with mismatching SP values found")
+            return []
+
+        # Keep only unique students by 'ID' (first occurrence).
+        if 'ID' in mismatches_df.columns:
+            before_dedup = len(mismatches_df)
+            mismatches_df = mismatches_df.drop_duplicates(subset=['ID'])
+            after_dedup = len(mismatches_df)
+            logger.info(f"Reduced from {before_dedup} rows to {after_dedup} unique students by ID")
+        else:
+            logger.warning("Column 'ID' not found - cannot deduplicate by student ID")
+
+        # Ensure optional columns exist to avoid KeyError when building dicts
+        for optional_col in ('Waarschuwing', 'Adviesrapport code'):
+            if optional_col not in mismatches_df.columns:
+                mismatches_df[optional_col] = None
+
+        # Build the list of mismatching students
+        mismatching_students = []
+        for _, row in mismatches_df.iterrows():
+            mismatching_students.append({
+                'ID': row.get('ID'),
+                'Achternaam': row.get('Achternaam'),
+                'Voornaam': row.get('Voornaam'),
+                'E-mail': row.get('E-mail'),
+                'Totaal_aantal_SP': row.get('Totaal aantal SP'),
+                'Aantal_SP_vereist': row.get('Aantal SP vereist'),
+                'Waarschuwing': row.get('Waarschuwing'),
+                'Adviesrapport_code': row.get('Adviesrapport code')
+            })
+
+        logger.info(f"Returning {len(mismatching_students)} unique students with mismatching SP values")
        return mismatching_students

    except Exception as e:
--- a/startpakketten/startpakket_processing.log
+++ b/startpakketten/startpakket_processing.log
--- a/startpakketten/todo.md
+++ b/startpakketten/todo.md
@@ -1,4 +0,0 @@
-extracurriculaire vakken komen niet uit de wizard. aparte excel voor nodig, nl dashboard inschrijvingen.
-Voor de check of ze extra keuzevakken opnemen en de vereiste dus hoger moet komen te staan: vgl de kollomen Totaal aantal SP Aantal SP vereist
-
-deze kolom moet ook worden gechecked als ze eigenlijk minder opnemen; dus die moeten altijd aan elkaar gelijk zijn.