logger = logging.getLogger(__name__)

# Column names as they appear in the processed Excel exports.
_ID_COLUMN = 'ID'
_PREDELIB_SP_COLUMN = 'Totaal aantal SP'
_DASHBOARD_SP_COLUMN = 'Ingeschr. SP (intern)'


def _normalize_id(value: Any) -> str:
    """Return the string key used to match an ID across both files.

    pandas stores an integer column as float as soon as it contains a blank
    cell, so the same student can show up as ``20250706`` in one file and
    ``20250706.0`` in the other. Integral floats are therefore rendered
    without the trailing ``.0``.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _first_value_by_id(df: pd.DataFrame, value_column: str) -> Dict[str, Any]:
    """Map every non-null ID to the value found in its first row.

    Built once per dataframe so the comparison does not have to re-scan
    (and re-stringify) the whole ID column for every ID.
    """
    lookup: Dict[str, Any] = {}
    for raw_id, value in zip(df[_ID_COLUMN], df[value_column]):
        if pd.notna(raw_id):
            # setdefault keeps the first occurrence when an ID is repeated.
            lookup.setdefault(_normalize_id(raw_id), value)
    return lookup


def _validate_inputs(predelib_df: pd.DataFrame, dashboard_df: pd.DataFrame) -> None:
    """Raise ValueError/KeyError when a dataframe is unusable for comparison."""
    for label, df in (("Predelib", predelib_df), ("Dashboard", dashboard_df)):
        if df is None or df.empty:
            error_msg = f"{label} dataframe is None or empty"
            logger.error(error_msg)
            raise ValueError(error_msg)

    required = (
        ("predelib", predelib_df, [_ID_COLUMN, _PREDELIB_SP_COLUMN]),
        ("dashboard", dashboard_df, [_ID_COLUMN, _DASHBOARD_SP_COLUMN]),
    )
    for label, df, columns in required:
        missing = [col for col in columns if col not in df.columns]
        if missing:
            error_msg = f"Missing required columns in {label} dataframe: {missing}"
            logger.error(error_msg)
            raise KeyError(error_msg)


def _sp_values_differ(predelib_sp: Any, dashboard_sp: Any, id_val: str) -> bool:
    """Compare two SP values numerically, falling back to string comparison."""
    try:
        return float(predelib_sp) != float(dashboard_sp)
    except (ValueError, TypeError) as e:
        logger.warning(f"Error converting SP values to numbers for ID {id_val}: {e}")
        return str(predelib_sp) != str(dashboard_sp)


def compare_sp_values(predelib_df: pd.DataFrame, dashboard_df: pd.DataFrame) -> List[Dict[str, Any]]:
    """
    Compare 'Totaal aantal SP' from predelib_df with 'Ingeschr. SP (intern)'
    from dashboard_df for every ID that occurs in both dataframes.

    IDs are matched on their string form (integral floats such as ``1.0`` are
    treated as ``1``). When an ID occurs on several rows, the value of its
    first row is used. IDs whose SP value is missing in either file are
    logged and skipped.

    Args:
        predelib_df: Processed predeliberation data; needs the columns
            'ID' and 'Totaal aantal SP'.
        dashboard_df: Processed dashboard data; needs the columns
            'ID' and 'Ingeschr. SP (intern)'.

    Returns:
        list: One dictionary per mismatch with the keys 'ID', 'Predelib_SP'
        and 'Dashboard_SP', in predelib file order; an empty list if all
        values match or no IDs are shared.

    Raises:
        ValueError: If an input dataframe is None or empty, or the ID column
            cannot be converted.
        KeyError: If required columns are missing.
    """
    logger.info("Starting SP values comparison")
    _validate_inputs(predelib_df, dashboard_df)
    logger.info("All required columns found in both dataframes")

    logger.debug(f"Predelib ID column type: {predelib_df[_ID_COLUMN].dtype}")
    logger.debug(f"Dashboard ID column type: {dashboard_df[_ID_COLUMN].dtype}")
    logger.debug(f"Sample predelib IDs: {list(predelib_df[_ID_COLUMN].head())}")
    logger.debug(f"Sample dashboard IDs: {list(dashboard_df[_ID_COLUMN].head())}")

    try:
        predelib_sp_by_id = _first_value_by_id(predelib_df, _PREDELIB_SP_COLUMN)
        dashboard_sp_by_id = _first_value_by_id(dashboard_df, _DASHBOARD_SP_COLUMN)
    except Exception as e:
        error_msg = f"Error converting IDs to strings: {e}"
        logger.error(error_msg)
        raise ValueError(error_msg) from e

    # Iterating the predelib lookup (a dict, so insertion-ordered) keeps the
    # result order deterministic, unlike iterating a set intersection.
    matching_ids = [id_val for id_val in predelib_sp_by_id if id_val in dashboard_sp_by_id]
    logger.info(f"Found {len(matching_ids)} matching IDs between the two dataframes")
    logger.info(f"Total predelib IDs: {len(predelib_sp_by_id)}")
    logger.info(f"Total dashboard IDs: {len(dashboard_sp_by_id)}")

    if not matching_ids:
        logger.warning("No matching IDs found between the dataframes")
        return []

    mismatches: List[Dict[str, Any]] = []
    processed_count = 0

    for id_val in matching_ids:
        predelib_sp = predelib_sp_by_id[id_val]
        dashboard_sp = dashboard_sp_by_id[id_val]
        try:
            if pd.isna(predelib_sp) or pd.isna(dashboard_sp):
                logger.warning(f"NaN values found for ID {id_val}: Predelib={predelib_sp}, Dashboard={dashboard_sp}")
                continue

            if _sp_values_differ(predelib_sp, dashboard_sp, id_val):
                mismatches.append({
                    'ID': id_val,
                    'Predelib_SP': predelib_sp,
                    'Dashboard_SP': dashboard_sp,
                })
                logger.debug(f"Mismatch found for ID {id_val}: Predelib={predelib_sp}, Dashboard={dashboard_sp}")

            processed_count += 1
        except Exception as e:
            # Best effort: one malformed record must not abort the whole run.
            logger.error(f"Error processing ID {id_val}: {e}")
            continue

    logger.info(f"Successfully processed {processed_count} matching records")

    if not mismatches:
        logger.info("All SP values match between the two dataframes!")
    else:
        logger.warning(f"Found {len(mismatches)} mismatches")
        for mismatch in mismatches:
            logger.info(f"Mismatch - ID {mismatch['ID']}: Predeliberatierapport SP={mismatch['Predelib_SP']}, Dashboard Inschrijvingen SP={mismatch['Dashboard_SP']}")

    return mismatches


def _configure_logging() -> None:
    """Send log records to sp_comparison.log and the console.

    Only called when this file is run as a script. Configuring the root
    logger at import time would take over the logging setup of any program
    that imports compare_sp_values (a later basicConfig call in that program
    is a no-op once the root logger has handlers).
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('sp_comparison.log'),
            logging.StreamHandler(),
        ],
    )


def _load_excel(path: str, label: str) -> pd.DataFrame:
    """Read one Excel file, logging a specific error before re-raising."""
    try:
        df = pd.read_excel(path)
    except FileNotFoundError:
        logger.error(f"{path} file not found")
        raise
    except Exception as e:
        logger.error(f"Error reading {path}: {e}")
        raise
    logger.info(f"Successfully loaded {label} file with shape: {df.shape}")
    return df


def _main() -> None:
    """Example run: compare db.xlsx with dashboard_inschrijvingen.xlsx."""
    _configure_logging()
    logger.info("Starting SP comparison script")

    try:
        from checkheaders import check_headers_predelibfile, check_headers_dashboard_inschrijvingenfile

        logger.info("Reading Excel files")
        df_predelib = _load_excel('db.xlsx', 'predelib')
        df_dashboard = _load_excel('dashboard_inschrijvingen.xlsx', 'dashboard')

        logger.info("Processing dataframes")
        try:
            processed_predelib_df = check_headers_predelibfile(df_predelib)
            logger.info(f"Processed predelib dataframe shape: {processed_predelib_df.shape}")
        except Exception as e:
            logger.error(f"Error processing predelib file: {e}")
            raise

        try:
            processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)
            logger.info(f"Processed dashboard dataframe shape: {processed_dashboard_df.shape}")
        except Exception as e:
            logger.error(f"Error processing dashboard file: {e}")
            raise

        # compare_sp_values logs its own start message, so none is logged here.
        try:
            mismatches = compare_sp_values(processed_predelib_df, processed_dashboard_df)
        except Exception as e:
            logger.error(f"Error during SP comparison: {e}")
            raise
        logger.info(f"SP comparison completed successfully. Found {len(mismatches)} mismatches.")

        # Print summary for console output
        print(f"\n{'='*50}")
        print("SP COMPARISON SUMMARY")
        print(f"{'='*50}")
        print(f"Predelib records processed: {len(processed_predelib_df)}")
        print(f"Dashboard records processed: {len(processed_dashboard_df)}")
        print(f"Mismatches found: {len(mismatches)}")

        if mismatches:
            print("\nDetailed mismatches:")
            for mismatch in mismatches:
                print(f"  ID {mismatch['ID']}: Predelib={mismatch['Predelib_SP']}, Dashboard={mismatch['Dashboard_SP']}")
        else:
            print("\nAll SP values match perfectly!")
        print(f"{'='*50}")

    except ImportError as e:
        logger.error(f"Import error: {e}")
        print("Error: Could not import required modules. Make sure checkheaders.py is in the same directory.")
    except Exception as e:
        logger.error(f"Unexpected error in main execution: {e}")
        print(f"An error occurred: {e}")
        print("Check the log file 'sp_comparison.log' for detailed error information.")
    finally:
        logger.info("SP comparison script completed")


if __name__ == "__main__":
    _main()
logger = logging.getLogger(__name__)

_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'


def _configure_logging(log_file: str, verbose: bool) -> None:
    """Route log records to ``log_file`` and the console.

    ``force=True`` replaces any handlers already installed on the root
    logger. Without it this call is silently ignored when an imported module
    configured logging first, and the requested log file stays empty.
    """
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format=_LOG_FORMAT,
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(),
        ],
        force=True,
    )


def validate_file_path(file_path: str) -> str:
    """Validate that the path is an existing Excel file.

    Used as an argparse ``type=`` callable.

    Args:
        file_path: Path given on the command line.

    Returns:
        The unchanged path when it is valid.

    Raises:
        argparse.ArgumentTypeError: If the path does not exist, is not a
            regular file, or does not end in .xlsx/.xls.
    """
    if not os.path.exists(file_path):
        raise argparse.ArgumentTypeError(f"File '{file_path}' does not exist")

    # A directory named "something.xlsx" would otherwise pass validation and
    # only fail later inside pandas with a less helpful message.
    if not os.path.isfile(file_path):
        raise argparse.ArgumentTypeError(f"'{file_path}' is not a file")

    if not file_path.lower().endswith(('.xlsx', '.xls')):
        raise argparse.ArgumentTypeError(f"File '{file_path}' is not an Excel file (.xlsx or .xls)")

    return file_path


def parse_arguments(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings. Defaults to ``sys.argv[1:]``
            (argparse's behaviour for ``None``), so existing callers are
            unaffected.

    Returns:
        argparse.Namespace with ``predelib``, ``dashboard``, ``output``,
        ``verbose`` and ``log_file``.
    """
    parser = argparse.ArgumentParser(
        description='Process and compare student data from predeliberation and dashboard Excel files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --predelib db.xlsx --dashboard dashboard_inschrijvingen.xlsx
  %(prog)s -p /path/to/predelib.xlsx -d /path/to/dashboard.xlsx --output results.json
  %(prog)s --predelib db.xlsx --dashboard dashboard.xlsx --verbose
        """
    )

    parser.add_argument(
        '--predelib', '-p',
        type=validate_file_path,
        required=True,
        help='Path to the predeliberation Excel file (db.xlsx)'
    )

    parser.add_argument(
        '--dashboard', '-d',
        type=validate_file_path,
        required=True,
        help='Path to the dashboard Excel file (dashboard_inschrijvingen.xlsx)'
    )

    parser.add_argument(
        '--output', '-o',
        type=str,
        help='Output file path for results (optional, prints to console if not specified)'
    )

    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose logging'
    )

    parser.add_argument(
        '--log-file',
        type=str,
        default='startpakket_processing.log',
        help='Path to log file (default: startpakket_processing.log)'
    )

    return parser.parse_args(argv)


def process_files(predelib_path: str, dashboard_path: str, verbose: bool = False):
    """Read both Excel files, normalise their headers and compare SP values.

    Args:
        predelib_path: Path to the predeliberation Excel file.
        dashboard_path: Path to the dashboard Excel file.
        verbose: Currently unused; kept so existing callers keep working.

    Returns:
        dict with the input paths, record counts, the list of mismatches,
        their count, and a 'status' of 'completed'.

    Raises:
        Exception: Any error from reading or processing is logged and
            re-raised unchanged.
    """
    try:
        logger.info(f"Reading predeliberation file: {predelib_path}")
        df_predelib = pd.read_excel(predelib_path)
        logger.info(f"Predelib file loaded successfully. Shape: {df_predelib.shape}")

        logger.info(f"Reading dashboard file: {dashboard_path}")
        df_dashboard = pd.read_excel(dashboard_path)
        logger.info(f"Dashboard file loaded successfully. Shape: {df_dashboard.shape}")

        logger.info("Processing predeliberation file headers")
        processed_predelib_df = check_headers_predelibfile(df_predelib)

        logger.info("Processing dashboard file headers")
        processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)

        logger.info("Comparing SP values between files")
        mismatches = compare_sp_values(processed_predelib_df, processed_dashboard_df)

        results = {
            'predelib_file': predelib_path,
            'dashboard_file': dashboard_path,
            'predelib_records': len(processed_predelib_df),
            'dashboard_records': len(processed_dashboard_df),
            'mismatches_count': len(mismatches),
            'mismatches': mismatches,
            'status': 'completed'
        }

        logger.info(f"Processing completed successfully. Found {len(mismatches)} mismatches.")
        return results

    except Exception as e:
        logger.error(f"Error processing files: {e}")
        raise


def _json_default(value):
    """Convert numpy scalars (e.g. int64 read from Excel) to Python values.

    json cannot serialise numpy integer types; they expose ``.item()`` which
    returns the equivalent built-in value.
    """
    item = getattr(value, 'item', None)
    if callable(item):
        return item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def save_results(results: dict, output_path: str):
    """Save results to ``output_path`` as UTF-8 encoded, indented JSON.

    Args:
        results: Result dictionary as returned by process_files.
        output_path: Destination file; overwritten if it exists.

    Raises:
        Exception: Any I/O or serialisation error is logged and re-raised.
    """
    try:
        import json
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False, default=_json_default)
        logger.info(f"Results saved to: {output_path}")
    except Exception as e:
        logger.error(f"Error saving results to {output_path}: {e}")
        raise


def print_summary(results: dict):
    """Print a human-readable summary of ``results`` to the console."""
    print(f"\n{'='*60}")
    print("STARTPAKKET PROCESSING SUMMARY")
    print(f"{'='*60}")
    print(f"Predelib file: {results['predelib_file']}")
    print(f"Dashboard file: {results['dashboard_file']}")
    print(f"Predelib records processed: {results['predelib_records']}")
    print(f"Dashboard records processed: {results['dashboard_records']}")
    print(f"Mismatches found: {results['mismatches_count']}")

    if results['mismatches']:
        print("\nDetailed mismatches:")
        for mismatch in results['mismatches']:
            print(f"  ID {mismatch['ID']}: Predelib={mismatch['Predelib_SP']}, Dashboard={mismatch['Dashboard_SP']}")
    else:
        print("\n✅ All SP values match perfectly!")

    print(f"Status: {results['status']}")
    print(f"{'='*60}")


def main(argv=None):
    """Command line entry point.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]``.

    Exits with 0 when all SP values match, 1 when mismatches were found or a
    fatal error occurred, and 130 when interrupted by the user.
    """
    try:
        args = parse_arguments(argv)

        # Logging is configured here, after parsing, so that --log-file and
        # --verbose actually take effect.
        _configure_logging(args.log_file, args.verbose)
        if args.verbose:
            logger.debug("Verbose logging enabled")

        logger.info("Starting startpakket processing")
        logger.info(f"Predelib file: {args.predelib}")
        logger.info(f"Dashboard file: {args.dashboard}")

        results = process_files(args.predelib, args.dashboard, args.verbose)

        if args.output:
            save_results(results, args.output)

        print_summary(results)

        exit_code = 0 if results['mismatches_count'] == 0 else 1
        logger.info(f"Processing completed with exit code: {exit_code}")
        sys.exit(exit_code)

    except KeyboardInterrupt:
        logger.info("Processing interrupted by user")
        sys.exit(130)
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
20250706: Predelib=39, Dashboard=45 +2025-07-29 11:56:36,798 - __main__ - INFO - SP comparison completed successfully. Found 1 mismatches. +2025-07-29 11:56:36,801 - __main__ - INFO - SP comparison script completed +2025-07-29 13:29:44,971 - __main__ - INFO - Starting SP comparison script +2025-07-29 13:29:45,011 - __main__ - INFO - Reading Excel files +2025-07-29 13:29:48,429 - __main__ - INFO - Successfully loaded predelib file with shape: (123, 22) +2025-07-29 13:29:48,456 - __main__ - INFO - Successfully loaded dashboard file with shape: (40, 36) +2025-07-29 13:29:48,456 - __main__ - INFO - Processing dataframes +2025-07-29 13:29:48,459 - __main__ - INFO - Processed predelib dataframe shape: (123, 22) +2025-07-29 13:29:48,460 - __main__ - INFO - Processed dashboard dataframe shape: (37, 36) +2025-07-29 13:29:48,460 - __main__ - INFO - Starting SP values comparison +2025-07-29 13:29:48,460 - __main__ - INFO - Starting SP values comparison +2025-07-29 13:29:48,460 - __main__ - INFO - All required columns found in both dataframes +2025-07-29 13:29:48,460 - __main__ - INFO - Found 37 matching IDs between the two dataframes +2025-07-29 13:29:48,460 - __main__ - INFO - Total predelib IDs: 37 +2025-07-29 13:29:48,461 - __main__ - INFO - Total dashboard IDs: 37 +2025-07-29 13:29:48,486 - __main__ - INFO - Successfully processed 37 matching records +2025-07-29 13:29:48,487 - __main__ - WARNING - Found 1 mismatches +2025-07-29 13:29:48,487 - __main__ - INFO - Mismatch - ID 20250706: Predeliberatierapport SP=39, Dashboard Inschrijvingen SP=45 +2025-07-29 13:29:48,487 - __main__ - INFO - SP comparison completed successfully. Found 1 mismatches. 
+2025-07-29 13:29:48,488 - __main__ - INFO - SP comparison script completed +2025-07-29 14:06:13,452 - __main__ - INFO - Starting startpakket processing +2025-07-29 14:06:13,453 - __main__ - INFO - Predelib file: db.xlsx +2025-07-29 14:06:13,453 - __main__ - INFO - Dashboard file: dashboard_inschrijvingen.xlsx +2025-07-29 14:06:13,453 - __main__ - INFO - Reading predeliberation file: db.xlsx +2025-07-29 14:06:14,888 - __main__ - INFO - Predelib file loaded successfully. Shape: (123, 22) +2025-07-29 14:06:14,888 - __main__ - INFO - Reading dashboard file: dashboard_inschrijvingen.xlsx +2025-07-29 14:06:14,948 - __main__ - INFO - Dashboard file loaded successfully. Shape: (40, 36) +2025-07-29 14:06:14,948 - __main__ - INFO - Processing predeliberation file headers +2025-07-29 14:06:14,952 - __main__ - INFO - Processing dashboard file headers +2025-07-29 14:06:14,953 - __main__ - INFO - Comparing SP values between files +2025-07-29 14:06:14,953 - compare_sp - INFO - Starting SP values comparison +2025-07-29 14:06:14,953 - compare_sp - INFO - All required columns found in both dataframes +2025-07-29 14:06:14,954 - compare_sp - INFO - Found 37 matching IDs between the two dataframes +2025-07-29 14:06:14,955 - compare_sp - INFO - Total predelib IDs: 37 +2025-07-29 14:06:14,955 - compare_sp - INFO - Total dashboard IDs: 37 +2025-07-29 14:06:14,967 - compare_sp - INFO - Successfully processed 37 matching records +2025-07-29 14:06:14,967 - compare_sp - WARNING - Found 1 mismatches +2025-07-29 14:06:14,968 - compare_sp - INFO - Mismatch - ID 20250706: Predeliberatierapport SP=39, Dashboard Inschrijvingen SP=45 +2025-07-29 14:06:14,968 - __main__ - INFO - Processing completed successfully. Found 1 mismatches. +2025-07-29 14:06:14,970 - __main__ - INFO - Processing completed with exit code: 1 diff --git a/startpakketten/startpakket_processing.log b/startpakketten/startpakket_processing.log new file mode 100644 index 0000000..e69de29