diff --git a/.gitignore b/.gitignore index b83c864..69c9f22 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,8 @@ __pycache__/ # Ignore Excel files *.xlsx -sisa_crawl/ \ No newline at end of file +# Ignore log files +*.log + +sisa_crawl/ + diff --git a/startpakketten/compare_sp.py b/startpakketten/compare_sp.py index 0508784..d02d66e 100644 --- a/startpakketten/compare_sp.py +++ b/startpakketten/compare_sp.py @@ -107,6 +107,7 @@ def compare_sp_values(predelib_df: pd.DataFrame, dashboard_df: pd.DataFrame) -> predelib_sp = predelib_matches['Totaal aantal SP'].iloc[0] dashboard_sp = dashboard_matches['Ingeschr. SP (intern)'].iloc[0] + name_student = f"{predelib_matches['Voornaam'].iloc[0]} {predelib_matches['Achternaam'].iloc[0]}"  # f-string tolerates NaN name parts; str + float raised TypeError before the NaN guard below # Handle potential NaN values if pd.isna(predelib_sp) or pd.isna(dashboard_sp): @@ -126,8 +127,10 @@ def compare_sp_values(predelib_df: pd.DataFrame, dashboard_df: pd.DataFrame) -> if predelib_sp_num != dashboard_sp_num: mismatch = { 'ID': id_val, + 'Name': name_student, 'Predelib_SP': predelib_sp, - 'Dashboard_SP': dashboard_sp + 'Dashboard_SP': dashboard_sp, + } mismatches.append(mismatch) logger.debug(f"Mismatch found for ID {id_val}: Predelib={predelib_sp}, Dashboard={dashboard_sp}") @@ -145,7 +148,7 @@ def compare_sp_values(predelib_df: pd.DataFrame, dashboard_df: pd.DataFrame) -> else: logger.warning(f"Found {len(mismatches)} mismatches") for mismatch in mismatches: - logger.info(f"Mismatch - ID {mismatch['ID']}: Predeliberatierapport SP={mismatch['Predelib_SP']}, Dashboard Inschrijvingen SP={mismatch['Dashboard_SP']}") + logger.info(f"Mismatch - ID {mismatch['ID']} ({mismatch['Name']}): Predeliberatierapport SP={mismatch['Predelib_SP']}, Dashboard Inschrijvingen SP={mismatch['Dashboard_SP']}") return mismatches diff --git a/startpakketten/process_predelib_file.py b/startpakketten/process_predelib_file.py new file mode 100644 index 0000000..6c2d93d --- /dev/null +++ b/startpakketten/process_predelib_file.py @@ -0,0 +1,223 @@ 
+import pandas as pd +import logging +from typing import List, Dict, Any, Optional + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler('predelib_processing.log'), + logging.StreamHandler() + ] +) + +logger = logging.getLogger(__name__) + + +def check_students_with_fail_adviesrapport(predelib_df: pd.DataFrame) -> List[Dict[str, Any]]: + """ + Check for students with 'FAIL' in 'Adviesrapport code' column and extract their details. + + Args: + predelib_df (pandas.DataFrame): Processed predeliberation dataframe + + Returns: + list: List of dictionaries containing failed student details + + Raises: + ValueError: If input dataframe is invalid + KeyError: If required columns are missing + """ + logger.info("Starting failed students check") + + try: + # Validate input dataframe + if predelib_df is None or predelib_df.empty: + error_msg = "Predelib dataframe is None or empty" + logger.error(error_msg) + raise ValueError(error_msg) + + logger.info(f"Predelib dataframe shape: {predelib_df.shape}") + + # Define required columns + required_columns = [ + 'ID', 'Achternaam', 'Voornaam', 'E-mail', + 'Totaal aantal SP', 'Aantal SP vereist', 'Waarschuwing', 'Adviesrapport code' + ] + + # Check for required columns + missing_columns = [col for col in required_columns if col not in predelib_df.columns] + + if missing_columns: + error_msg = f"Missing required columns in predelib dataframe: {missing_columns}" + logger.error(error_msg) + logger.info(f"Available columns: {list(predelib_df.columns)}") + raise KeyError(error_msg) + + logger.info("All required columns found in dataframe") + + # Debug Adviesrapport code column + logger.debug(f"Adviesrapport code column type: {predelib_df['Adviesrapport code'].dtype}") + unique_codes = predelib_df['Adviesrapport code'].unique() + logger.debug(f"Unique Adviesrapport codes: {unique_codes}") + + # Filter for FAIL cases + try: + # 
Convert to string and check for FAIL (case-insensitive) + fail_mask = predelib_df['Adviesrapport code'].astype(str).str.upper() == 'FAIL' + students_with_fail_ar_df = predelib_df[fail_mask].copy() + + logger.info(f"Found {len(students_with_fail_ar_df)} students with FAIL status") + + # Remove duplicate rows (exact same values in all columns) + initial_count = len(students_with_fail_ar_df) + students_with_fail_ar_df = students_with_fail_ar_df.drop_duplicates() + final_count = len(students_with_fail_ar_df) + + duplicates_removed = initial_count - final_count + if duplicates_removed > 0: + logger.info(f"Removed {duplicates_removed} duplicate rows") + else: + logger.info("No duplicate rows found") + + logger.info(f"Final count after duplicate removal: {final_count} students with FAIL status") + + except Exception as e: + error_msg = f"Error filtering for FAIL status: {e}" + logger.error(error_msg) + raise ValueError(error_msg) + + if len(students_with_fail_ar_df) == 0: + logger.info("No students with FAIL status found") + return [] + + # Extract details for failed students + students_with_fail_ar = [] + processed_count = 0 + + for index, row in students_with_fail_ar_df.iterrows(): + try: + # Extract student details + student_details = { + 'ID': row['ID'], + 'Achternaam': row['Achternaam'], + 'Voornaam': row['Voornaam'], + 'E-mail': row['E-mail'], + 'Totaal_aantal_SP': row['Totaal aantal SP'], + 'Aantal_SP_vereist': row['Aantal SP vereist'], + 'Waarschuwing': row['Waarschuwing'], + 'Adviesrapport_code': row['Adviesrapport code'] + } + + # Handle potential NaN values + for key, value in student_details.items(): + if pd.isna(value): + student_details[key] = None + logger.warning(f"NaN value found for {key} in student ID: {row['ID']}") + + students_with_fail_ar.append(student_details) + processed_count += 1 + + logger.debug(f"Processed failed student: ID={row['ID']}, " + f"Name={row['Achternaam']}, {row['Voornaam']}") + + except Exception as e: + logger.error(f"Error 
processing student at index {index}: {e}") + continue + + logger.info(f"Successfully processed {processed_count} failed students") + + # Log summary + if students_with_fail_ar: + logger.warning(f"Found {len(students_with_fail_ar)} students with FAIL status") + for student in students_with_fail_ar: + logger.info(f"Failed student - ID: {student['ID']}, " + f"Name: {student['Achternaam']}, {student['Voornaam']}, " + f"SP: {student['Totaal_aantal_SP']}/{student['Aantal_SP_vereist']}") + else: + logger.info("No failed students found") + + return students_with_fail_ar + + except Exception as e: + logger.error(f"Unexpected error in check_students_with_fail_adviesrapport: {e}") + raise + + +def print_students_with_fail_ar_summary(students_with_fail_ar: List[Dict[str, Any]], predelib_df: pd.DataFrame): + """Print a formatted summary of students with FAIL status""" + print(f"\n{'='*80}") + print("Students with FAIL AR status report") + print(f"{'='*80}") + print(f"Total students processed: {len(predelib_df)}") + print(f"Students with FAIL status: {len(students_with_fail_ar)}") + + if students_with_fail_ar: + print(f"\nDetailed failed students list:") + print(f"{'ID':<10} {'Name':<25} {'Email':<30} {'SP':<15} {'Warning':<20}") + print(f"{'-'*10} {'-'*25} {'-'*30} {'-'*15} {'-'*20}") + + for student in students_with_fail_ar: + name = f"{student['Achternaam']}, {student['Voornaam']}" + sp_info = f"{student['Totaal_aantal_SP']}/{student['Aantal_SP_vereist']}" + warning = str(student['Waarschuwing']) if student['Waarschuwing'] else "None" + + print(f"{str(student['ID']):<10} {name[:25]:<25} {str(student['E-mail'])[:30]:<30} " + f"{sp_info:<15} {warning[:20]:<20}") + else: + print("\n✅ No students with FAIL status found!") + + print(f"{'='*80}") + + +if __name__ == "__main__": + # Example usage - can be used for testing + logger.info("Starting failed students check script") + + try: + from checkheaders import check_headers_predelibfile + + # Read the Excel file + logger.info("Reading
predelib Excel file") + try: + df_predelib = pd.read_excel('db.xlsx') + logger.info(f"Successfully loaded predelib file with shape: {df_predelib.shape}") + except FileNotFoundError: + logger.error("db.xlsx file not found") + raise + except Exception as e: + logger.error(f"Error reading db.xlsx: {e}") + raise + + # Process the dataframe + logger.info("Processing predelib dataframe") + try: + processed_predelib_df = check_headers_predelibfile(df_predelib) + logger.info(f"Processed predelib dataframe shape: {processed_predelib_df.shape}") + except Exception as e: + logger.error(f"Error processing predelib file: {e}") + raise + + # Check for failed students + logger.info("Checking for failed students") + try: + students_with_fail_ar = check_students_with_fail_adviesrapport(processed_predelib_df) + logger.info(f"Failed students check completed. Found {len(students_with_fail_ar)} failed students.") + + # Print summary for console output + print_students_with_fail_ar_summary(students_with_fail_ar, processed_predelib_df) + + except Exception as e: + logger.error(f"Error during failed students check: {e}") + raise + + except ImportError as e: + logger.error(f"Import error: {e}") + print("Error: Could not import required modules. 
Make sure checkheaders.py is in the same directory.") + except Exception as e: + logger.error(f"Unexpected error in main execution: {e}") + print(f"An error occurred: {e}") + print("Check the log file 'predelib_processing.log' for detailed error information.") + finally: + logger.info("Failed students check script completed") \ No newline at end of file diff --git a/startpakketten/script.py b/startpakketten/script.py index 2603850..c510f27 100644 --- a/startpakketten/script.py +++ b/startpakketten/script.py @@ -6,6 +6,7 @@ import os from pathlib import Path from checkheaders import check_headers_dashboard_inschrijvingenfile, check_headers_predelibfile +from process_predelib_file import check_students_with_fail_adviesrapport, print_students_with_fail_ar_summary from compare_sp import compare_sp_values # Configure logging @@ -99,7 +100,11 @@ def process_files(predelib_path: str, dashboard_path: str, verbose: bool = False logger.info("Processing dashboard file headers") processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard) - + + # Check the predeliberation file for students with a fail in 'Adviesrapport code' + logger.info("Checking for students with FAIL status in predeliberation file") + students_with_fail = check_students_with_fail_adviesrapport(processed_predelib_df) + # Compare SP values logger.info("Comparing SP values between files") mismatches = compare_sp_values(processed_predelib_df, processed_dashboard_df) @@ -110,6 +115,8 @@ def process_files(predelib_path: str, dashboard_path: str, verbose: bool = False 'dashboard_file': dashboard_path, 'predelib_records': len(processed_predelib_df), 'dashboard_records': len(processed_dashboard_df), + 'students_with_fail_count': len(students_with_fail), + 'students_with_fail': students_with_fail, 'mismatches_count': len(mismatches), 'mismatches': mismatches, 'status': 'completed' @@ -144,12 +151,16 @@ def print_summary(results: dict): print(f"Dashboard file: {results['dashboard_file']}") 
print(f"Predelib records processed: {results['predelib_records']}") print(f"Dashboard records processed: {results['dashboard_records']}") + print(f"Students with FAIL adviesrapport found: {results['students_with_fail_count']}") print(f"Mismatches found: {results['mismatches_count']}") + if results['students_with_fail_count'] > 0: + print_students_with_fail_ar_summary(results['students_with_fail'], range(results['predelib_records']))  # was results['predelib_file']: the helper only calls len() on this arg, so the path's character count was reported as "Total students processed" + if results['mismatches']: - print(f"\nDetailed mismatches:") + print(f"\nDetailed mismatches between SP predeliberatierapport and Dashboard Inschrijvingen:") for mismatch in results['mismatches']: - print(f" ID {mismatch['ID']}: Predelib={mismatch['Predelib_SP']}, Dashboard={mismatch['Dashboard_SP']}") + print(f"Mismatch - ID {mismatch['ID']} ({mismatch['Name']}): Predeliberatierapport SP={mismatch['Predelib_SP']}, Dashboard Inschrijvingen SP={mismatch['Dashboard_SP']}") else: print("\n✅ All SP values match perfectly!")