Enhance SP comparison script with logging, error handling, and command-line argument parsing

This commit is contained in:
bdaneels 2025-07-29 14:07:38 +02:00
parent 248417c4b8
commit c5d356b366
4 changed files with 465 additions and 86 deletions

View File

@ -1,6 +1,20 @@
import pandas as pd
import logging
from typing import List, Dict, Any, Optional
# Module-level logger. Handlers are attached by the application entry point,
# not on import, so importing this module never changes the importer's
# logging configuration.
logger = logging.getLogger(__name__)


def _configure_logging() -> None:
    """Attach the file and console handlers used when this module runs as a script."""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('sp_comparison.log'),
            logging.StreamHandler()
        ]
    )


def _validate_inputs(predelib_df: pd.DataFrame, dashboard_df: pd.DataFrame) -> None:
    """Raise ValueError for a missing/empty frame and KeyError for missing columns."""
    for label, frame in (("Predelib", predelib_df), ("Dashboard", dashboard_df)):
        if frame is None or frame.empty:
            error_msg = f"{label} dataframe is None or empty"
            logger.error(error_msg)
            raise ValueError(error_msg)

    required = (
        ("predelib", predelib_df, ['ID', 'Totaal aantal SP']),
        ("dashboard", dashboard_df, ['ID', 'Ingeschr. SP (intern)']),
    )
    for label, frame, columns in required:
        missing = [col for col in columns if col not in frame.columns]
        if missing:
            error_msg = f"Missing required columns in {label} dataframe: {missing}"
            logger.error(error_msg)
            raise KeyError(error_msg)


def _first_value_by_id(df: pd.DataFrame, value_column: str) -> Dict[str, Any]:
    """Map str(ID) to the value of the first row carrying that ID; rows without an ID are skipped."""
    lookup: Dict[str, Any] = {}
    # tolist() hands back plain Python scalars for numeric columns, which keeps
    # the reported values JSON-serializable for callers.
    for raw_id, value in zip(df['ID'].tolist(), df[value_column].tolist()):
        if pd.notna(raw_id):
            lookup.setdefault(str(raw_id), value)
    return lookup


def compare_sp_values(predelib_df: pd.DataFrame, dashboard_df: pd.DataFrame) -> List[Dict[str, Any]]:
    """
    Compare 'Totaal aantal SP' from predelib_df with 'Ingeschr. SP (intern)' from dashboard_df
    for matching IDs between the two dataframes.

    IDs are compared by their string form. When an ID occurs on several rows,
    the first row is used. Values are compared numerically when both convert
    to float, otherwise as strings. IDs for which either value is missing
    (NaN) are logged and skipped.

    Args:
        predelib_df: Frame with at least the columns 'ID' and 'Totaal aantal SP'.
        dashboard_df: Frame with at least the columns 'ID' and 'Ingeschr. SP (intern)'.

    Returns:
        list: List of dictionaries ('ID', 'Predelib_SP', 'Dashboard_SP'), one per
        mismatch and sorted by ID string, or an empty list if all values match
        or no ID occurs in both frames.

    Raises:
        ValueError: If input dataframes are invalid
        KeyError: If required columns are missing
    """
    logger.info("Starting SP values comparison")

    try:
        _validate_inputs(predelib_df, dashboard_df)
        logger.info("All required columns found in both dataframes")

        # Debug ID columns
        logger.debug(f"Predelib ID column type: {predelib_df['ID'].dtype}")
        logger.debug(f"Dashboard ID column type: {dashboard_df['ID'].dtype}")
        logger.debug(f"Sample predelib IDs: {list(predelib_df['ID'].head())}")
        logger.debug(f"Sample dashboard IDs: {list(dashboard_df['ID'].head())}")

        # Build both lookups once instead of re-scanning the ID columns per ID.
        try:
            predelib_by_id = _first_value_by_id(predelib_df, 'Totaal aantal SP')
            dashboard_by_id = _first_value_by_id(dashboard_df, 'Ingeschr. SP (intern)')
        except Exception as e:
            error_msg = f"Error converting IDs to strings: {e}"
            logger.error(error_msg)
            raise ValueError(error_msg) from e

        # Sorted so the reported order is reproducible between runs.
        matching_ids = sorted(predelib_by_id.keys() & dashboard_by_id.keys())
        logger.info(f"Found {len(matching_ids)} matching IDs between the two dataframes")
        logger.info(f"Total predelib IDs: {len(predelib_by_id)}")
        logger.info(f"Total dashboard IDs: {len(dashboard_by_id)}")

        if not matching_ids:
            logger.warning("No matching IDs found between the dataframes")
            return []

        # Compare SP values for matching IDs
        mismatches: List[Dict[str, Any]] = []
        processed_count = 0

        for id_val in matching_ids:
            # Best effort per ID: one unreadable row must not abort the whole run.
            try:
                predelib_sp = predelib_by_id[id_val]
                dashboard_sp = dashboard_by_id[id_val]

                if pd.isna(predelib_sp) or pd.isna(dashboard_sp):
                    logger.warning(f"NaN values found for ID {id_val}: Predelib={predelib_sp}, Dashboard={dashboard_sp}")
                    continue

                try:
                    left, right = float(predelib_sp), float(dashboard_sp)
                except (ValueError, TypeError) as e:
                    logger.warning(f"Error converting SP values to numbers for ID {id_val}: {e}")
                    # Fall back to string comparison
                    left, right = str(predelib_sp), str(dashboard_sp)

                if left != right:
                    mismatches.append({
                        'ID': id_val,
                        'Predelib_SP': predelib_sp,
                        'Dashboard_SP': dashboard_sp
                    })
                    logger.debug(f"Mismatch found for ID {id_val}: Predelib={predelib_sp}, Dashboard={dashboard_sp}")

                processed_count += 1
            except Exception as e:
                logger.error(f"Error processing ID {id_val}: {e}")
                continue

        logger.info(f"Successfully processed {processed_count} matching records")

        if not mismatches:
            logger.info("All SP values match between the two dataframes!")
        else:
            logger.warning(f"Found {len(mismatches)} mismatches")
            for mismatch in mismatches:
                logger.info(f"Mismatch - ID {mismatch['ID']}: Predeliberatierapport SP={mismatch['Predelib_SP']}, Dashboard Inschrijvingen SP={mismatch['Dashboard_SP']}")

        return mismatches

    except (ValueError, KeyError):
        # Validation failures were already logged where they were raised.
        raise
    except Exception as e:
        logger.exception(f"Unexpected error in compare_sp_values: {e}")
        raise


def _read_workbook(path: str, label: str) -> pd.DataFrame:
    """Read one Excel workbook, logging the outcome; read errors are re-raised."""
    try:
        frame = pd.read_excel(path)
    except FileNotFoundError:
        logger.error(f"{path} file not found")
        raise
    except Exception as e:
        logger.error(f"Error reading {path}: {e}")
        raise
    logger.info(f"Successfully loaded {label} file with shape: {frame.shape}")
    return frame


def _run_cli() -> int:
    """Run the comparison on the default workbooks; return the process exit code (0 ok, 1 error)."""
    logger.info("Starting SP comparison script")
    try:
        # Only the project import is guarded by ImportError, so an unrelated
        # import failure (e.g. a missing Excel engine) is not misreported.
        try:
            from checkheaders import check_headers_predelibfile, check_headers_dashboard_inschrijvingenfile
        except ImportError as e:
            logger.error(f"Import error: {e}")
            print("Error: Could not import required modules. Make sure checkheaders.py is in the same directory.")
            return 1

        # Read the Excel files
        logger.info("Reading Excel files")
        df_predelib = _read_workbook('db.xlsx', 'predelib')
        df_dashboard = _read_workbook('dashboard_inschrijvingen.xlsx', 'dashboard')

        # Process the dataframes
        logger.info("Processing dataframes")
        try:
            processed_predelib_df = check_headers_predelibfile(df_predelib)
            logger.info(f"Processed predelib dataframe shape: {processed_predelib_df.shape}")
        except Exception as e:
            logger.error(f"Error processing predelib file: {e}")
            raise

        try:
            processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)
            logger.info(f"Processed dashboard dataframe shape: {processed_dashboard_df.shape}")
        except Exception as e:
            logger.error(f"Error processing dashboard file: {e}")
            raise

        # Compare SP values between the two processed dataframes
        # (compare_sp_values logs its own start message).
        try:
            mismatches = compare_sp_values(processed_predelib_df, processed_dashboard_df)
            logger.info(f"SP comparison completed successfully. Found {len(mismatches)} mismatches.")

            # Print summary for console output
            print(f"\n{'='*50}")
            print("SP COMPARISON SUMMARY")
            print(f"{'='*50}")
            print(f"Predelib records processed: {len(processed_predelib_df)}")
            print(f"Dashboard records processed: {len(processed_dashboard_df)}")
            print(f"Mismatches found: {len(mismatches)}")

            if mismatches:
                print("\nDetailed mismatches:")
                for mismatch in mismatches:
                    print(f" ID {mismatch['ID']}: Predelib={mismatch['Predelib_SP']}, Dashboard={mismatch['Dashboard_SP']}")
            else:
                print("\nAll SP values match perfectly!")
            print(f"{'='*50}")
        except Exception as e:
            logger.error(f"Error during SP comparison: {e}")
            raise

        return 0

    except Exception as e:
        # logger.exception records the traceback the console message points to.
        logger.exception(f"Unexpected error in main execution: {e}")
        print(f"An error occurred: {e}")
        print("Check the log file 'sp_comparison.log' for detailed error information.")
        return 1
    finally:
        logger.info("SP comparison script completed")


if __name__ == "__main__":
    # Example usage - can be used for testing
    import sys

    _configure_logging()
    sys.exit(_run_cli())

View File

@ -1,20 +1,200 @@
import pandas as pd
import argparse
import logging
import sys
import os
from pathlib import Path
from checkheaders import check_headers_dashboard_inschrijvingenfile, check_headers_predelibfile
from compare_sp import compare_sp_values
# Read the Excel file
df_predelib = pd.read_excel('db.xlsx')
df_dashboard = pd.read_excel('dashboard_inschrijvingen.xlsx')
processed_predelib_df = check_headers_predelibfile(df_predelib)
processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)
# Configure logging: INFO and above go to both a log file and the console.
# NOTE(review): logging.basicConfig() does nothing when the root logger already
# has handlers. If any module imported above configured logging first, the
# handlers listed here (including this log file) are never attached - confirm
# which configuration is actually in effect at runtime.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('startpakket_processing.log'),
        logging.StreamHandler()
    ]
)
# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
# Further processing can be done with processed_predelib_df and processed_dashboard_df
print("Processed Predelib DataFrame:")
print(processed_predelib_df)
print("\nProcessed Dashboard DataFrame:")
print(processed_dashboard_df)
def validate_file_path(file_path: str) -> str:
    """Check a command-line path argument and hand it back unchanged.

    Intended as an argparse ``type=`` callback: the path must exist and its
    name must end in ``.xlsx`` or ``.xls`` (case-insensitive).

    Raises:
        argparse.ArgumentTypeError: If the path does not exist or does not
            have an Excel extension.
    """
    excel_suffixes = ('.xlsx', '.xls')

    path_exists = os.path.exists(file_path)
    if not path_exists:
        raise argparse.ArgumentTypeError(f"File '{file_path}' does not exist")

    if file_path.lower().endswith(excel_suffixes):
        return file_path

    raise argparse.ArgumentTypeError(f"File '{file_path}' is not an Excel file (.xlsx or .xls)")
compare_sp_values(processed_predelib_df, processed_dashboard_df)
print("\nComparison of the predelib file with the dashboard file on SP values complete.")
def parse_arguments():
    """Build the command-line interface and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``predelib``, ``dashboard``,
        ``output``, ``verbose`` and ``log_file``.
    """
    usage_examples = (
        "\n"
        "Examples:\n"
        "%(prog)s --predelib db.xlsx --dashboard dashboard_inschrijvingen.xlsx\n"
        "%(prog)s -p /path/to/predelib.xlsx -d /path/to/dashboard.xlsx --output results.json\n"
        "%(prog)s --predelib db.xlsx --dashboard dashboard.xlsx --verbose\n"
    )

    parser = argparse.ArgumentParser(
        description='Process and compare student data from predeliberation and dashboard Excel files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # (flags, keyword settings) in the order they should appear in --help.
    option_specs = [
        (
            ('--predelib', '-p'),
            dict(
                type=validate_file_path,
                required=True,
                help='Path to the predeliberation Excel file (db.xlsx)',
            ),
        ),
        (
            ('--dashboard', '-d'),
            dict(
                type=validate_file_path,
                required=True,
                help='Path to the dashboard Excel file (dashboard_inschrijvingen.xlsx)',
            ),
        ),
        (
            ('--output', '-o'),
            dict(
                type=str,
                help='Output file path for results (optional, prints to console if not specified)',
            ),
        ),
        (
            ('--verbose', '-v'),
            dict(
                action='store_true',
                help='Enable verbose logging',
            ),
        ),
        (
            ('--log-file',),
            dict(
                type=str,
                default='startpakket_processing.log',
                help='Path to log file (default: startpakket_processing.log)',
            ),
        ),
    ]
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    return parser.parse_args()
def process_files(predelib_path: str, dashboard_path: str, verbose: bool = False):
    """Load both workbooks, normalise their headers and compare the SP values.

    Args:
        predelib_path: Path of the predeliberation Excel file.
        dashboard_path: Path of the dashboard Excel file.
        verbose: Accepted for the command-line interface; not used here.

    Returns:
        dict with the keys 'predelib_file', 'dashboard_file',
        'predelib_records', 'dashboard_records', 'mismatches_count',
        'mismatches' and 'status'.

    Raises:
        Exception: Any error from reading, header processing or comparison is
            logged and re-raised unchanged.
    """
    try:
        # Load the two workbooks.
        logger.info(f"Reading predeliberation file: {predelib_path}")
        raw_predelib = pd.read_excel(predelib_path)
        logger.info(f"Predelib file loaded successfully. Shape: {raw_predelib.shape}")

        logger.info(f"Reading dashboard file: {dashboard_path}")
        raw_dashboard = pd.read_excel(dashboard_path)
        logger.info(f"Dashboard file loaded successfully. Shape: {raw_dashboard.shape}")

        # Header checks return the frames used for the comparison.
        logger.info("Processing predeliberation file headers")
        predelib_frame = check_headers_predelibfile(raw_predelib)

        logger.info("Processing dashboard file headers")
        dashboard_frame = check_headers_dashboard_inschrijvingenfile(raw_dashboard)

        logger.info("Comparing SP values between files")
        found = compare_sp_values(predelib_frame, dashboard_frame)

        summary = dict(
            predelib_file=predelib_path,
            dashboard_file=dashboard_path,
            predelib_records=len(predelib_frame),
            dashboard_records=len(dashboard_frame),
            mismatches_count=len(found),
            mismatches=found,
            status='completed',
        )

        logger.info(f"Processing completed successfully. Found {len(found)} mismatches.")
        return summary

    except Exception as e:
        logger.error(f"Error processing files: {e}")
        raise
def save_results(results: dict, output_path: str):
    """Write *results* to *output_path* as indented UTF-8 JSON.

    NumPy scalar values (for example the integer SP values that pandas returns
    for individual cells) are converted to plain Python numbers, because the
    json module cannot serialize them. The document is serialized completely
    before the file is opened, so a serialization error does not truncate an
    existing output file.

    Args:
        results: Result dictionary to store.
        output_path: Destination file path; an existing file is overwritten.

    Raises:
        TypeError: If *results* holds a value that cannot be converted to JSON.
        OSError: If the file cannot be written.
    """
    import json

    def _to_builtin(value):
        # NumPy scalars expose item(), which returns the native Python value.
        to_item = getattr(value, 'item', None)
        if callable(to_item):
            return to_item()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    try:
        payload = json.dumps(results, indent=2, ensure_ascii=False, default=_to_builtin)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(payload)
        logger.info(f"Results saved to: {output_path}")
    except Exception as e:
        logger.error(f"Error saving results to {output_path}: {e}")
        raise
def print_summary(results: dict):
    """Write a human-readable report of *results* to standard output.

    Expects the keys 'predelib_file', 'dashboard_file', 'predelib_records',
    'dashboard_records', 'mismatches_count', 'mismatches' and 'status'; each
    entry of 'mismatches' needs 'ID', 'Predelib_SP' and 'Dashboard_SP'.
    """
    rule = '=' * 60

    print(f"\n{rule}")
    print("STARTPAKKET PROCESSING SUMMARY")
    print(rule)

    # Header lines, printed in this order.
    header_fields = (
        ("Predelib file", 'predelib_file'),
        ("Dashboard file", 'dashboard_file'),
        ("Predelib records processed", 'predelib_records'),
        ("Dashboard records processed", 'dashboard_records'),
        ("Mismatches found", 'mismatches_count'),
    )
    for label, key in header_fields:
        print(f"{label}: {results[key]}")

    mismatch_rows = results['mismatches']
    if not mismatch_rows:
        print("\n✅ All SP values match perfectly!")
    else:
        print("\nDetailed mismatches:")
        for row in mismatch_rows:
            print(f" ID {row['ID']}: Predelib={row['Predelib_SP']}, Dashboard={row['Dashboard_SP']}")

    print(f"Status: {results['status']}")
    print(rule)
def _redirect_file_logging(log_file: str) -> None:
    """Make the root logger write its file output to *log_file*.

    File handlers pointing at other files are removed and closed; console
    handlers are left alone. The new handler reuses the formatter of an
    existing handler so the log layout does not change.
    """
    root = logging.getLogger()
    target = os.path.abspath(log_file)
    formatter = next((h.formatter for h in root.handlers if h.formatter is not None), None)

    already_attached = False
    for handler in list(root.handlers):
        if not isinstance(handler, logging.FileHandler):
            continue
        if handler.baseFilename == target and not already_attached:
            already_attached = True
            continue
        root.removeHandler(handler)
        handler.close()

    if not already_attached:
        replacement = logging.FileHandler(log_file)
        replacement.setFormatter(formatter)
        root.addHandler(replacement)


def main():
    """Command-line entry point.

    Parses the arguments, applies the logging options, runs the comparison,
    optionally stores the results and prints a summary.

    Exit codes: 0 when no mismatches were found, 1 when mismatches were found
    or an error occurred, 130 when interrupted from the keyboard.
    """
    try:
        # Parse arguments
        args = parse_arguments()

        # Honour --log-file; previously the option was parsed but never used.
        _redirect_file_logging(args.log_file)

        # Configure logging level
        if args.verbose:
            logging.getLogger().setLevel(logging.DEBUG)
            logger.debug("Verbose logging enabled")

        logger.info("Starting startpakket processing")
        logger.info(f"Predelib file: {args.predelib}")
        logger.info(f"Dashboard file: {args.dashboard}")

        # Process files
        results = process_files(args.predelib, args.dashboard, args.verbose)

        # Save results if output path specified
        if args.output:
            save_results(results, args.output)

        # Print summary
        print_summary(results)

        # Exit with appropriate code
        exit_code = 0 if results['mismatches_count'] == 0 else 1
        logger.info(f"Processing completed with exit code: {exit_code}")
        # SystemExit is not an Exception subclass, so the handler below does not catch it.
        sys.exit(exit_code)

    except KeyboardInterrupt:
        logger.info("Processing interrupted by user")
        sys.exit(130)
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,58 @@
2025-07-29 11:56:36,048 - __main__ - INFO - Starting SP comparison script
2025-07-29 11:56:36,086 - __main__ - INFO - Reading Excel files
2025-07-29 11:56:36,757 - __main__ - INFO - Successfully loaded predelib file with shape: (123, 22)
2025-07-29 11:56:36,776 - __main__ - INFO - Successfully loaded dashboard file with shape: (40, 36)
2025-07-29 11:56:36,776 - __main__ - INFO - Processing dataframes
2025-07-29 11:56:36,778 - __main__ - INFO - Processed predelib dataframe shape: (123, 22)
2025-07-29 11:56:36,780 - __main__ - INFO - Processed dashboard dataframe shape: (37, 36)
2025-07-29 11:56:36,781 - __main__ - INFO - Starting SP values comparison
2025-07-29 11:56:36,781 - __main__ - INFO - Starting SP values comparison
2025-07-29 11:56:36,781 - __main__ - INFO - Predelib dataframe shape: (123, 22)
2025-07-29 11:56:36,782 - __main__ - INFO - Dashboard dataframe shape: (37, 36)
2025-07-29 11:56:36,782 - __main__ - INFO - All required columns found in both dataframes
2025-07-29 11:56:36,782 - __main__ - INFO - Found 37 matching IDs between the two dataframes
2025-07-29 11:56:36,783 - __main__ - INFO - Total predelib IDs: 37
2025-07-29 11:56:36,783 - __main__ - INFO - Total dashboard IDs: 37
2025-07-29 11:56:36,798 - __main__ - INFO - Successfully processed 37 matching records
2025-07-29 11:56:36,798 - __main__ - WARNING - Found 1 mismatches
2025-07-29 11:56:36,798 - __main__ - INFO - Mismatch - ID 20250706: Predelib=39, Dashboard=45
2025-07-29 11:56:36,798 - __main__ - INFO - SP comparison completed successfully. Found 1 mismatches.
2025-07-29 11:56:36,801 - __main__ - INFO - SP comparison script completed
2025-07-29 13:29:44,971 - __main__ - INFO - Starting SP comparison script
2025-07-29 13:29:45,011 - __main__ - INFO - Reading Excel files
2025-07-29 13:29:48,429 - __main__ - INFO - Successfully loaded predelib file with shape: (123, 22)
2025-07-29 13:29:48,456 - __main__ - INFO - Successfully loaded dashboard file with shape: (40, 36)
2025-07-29 13:29:48,456 - __main__ - INFO - Processing dataframes
2025-07-29 13:29:48,459 - __main__ - INFO - Processed predelib dataframe shape: (123, 22)
2025-07-29 13:29:48,460 - __main__ - INFO - Processed dashboard dataframe shape: (37, 36)
2025-07-29 13:29:48,460 - __main__ - INFO - Starting SP values comparison
2025-07-29 13:29:48,460 - __main__ - INFO - Starting SP values comparison
2025-07-29 13:29:48,460 - __main__ - INFO - All required columns found in both dataframes
2025-07-29 13:29:48,460 - __main__ - INFO - Found 37 matching IDs between the two dataframes
2025-07-29 13:29:48,460 - __main__ - INFO - Total predelib IDs: 37
2025-07-29 13:29:48,461 - __main__ - INFO - Total dashboard IDs: 37
2025-07-29 13:29:48,486 - __main__ - INFO - Successfully processed 37 matching records
2025-07-29 13:29:48,487 - __main__ - WARNING - Found 1 mismatches
2025-07-29 13:29:48,487 - __main__ - INFO - Mismatch - ID 20250706: Predeliberatierapport SP=39, Dashboard Inschrijvingen SP=45
2025-07-29 13:29:48,487 - __main__ - INFO - SP comparison completed successfully. Found 1 mismatches.
2025-07-29 13:29:48,488 - __main__ - INFO - SP comparison script completed
2025-07-29 14:06:13,452 - __main__ - INFO - Starting startpakket processing
2025-07-29 14:06:13,453 - __main__ - INFO - Predelib file: db.xlsx
2025-07-29 14:06:13,453 - __main__ - INFO - Dashboard file: dashboard_inschrijvingen.xlsx
2025-07-29 14:06:13,453 - __main__ - INFO - Reading predeliberation file: db.xlsx
2025-07-29 14:06:14,888 - __main__ - INFO - Predelib file loaded successfully. Shape: (123, 22)
2025-07-29 14:06:14,888 - __main__ - INFO - Reading dashboard file: dashboard_inschrijvingen.xlsx
2025-07-29 14:06:14,948 - __main__ - INFO - Dashboard file loaded successfully. Shape: (40, 36)
2025-07-29 14:06:14,948 - __main__ - INFO - Processing predeliberation file headers
2025-07-29 14:06:14,952 - __main__ - INFO - Processing dashboard file headers
2025-07-29 14:06:14,953 - __main__ - INFO - Comparing SP values between files
2025-07-29 14:06:14,953 - compare_sp - INFO - Starting SP values comparison
2025-07-29 14:06:14,953 - compare_sp - INFO - All required columns found in both dataframes
2025-07-29 14:06:14,954 - compare_sp - INFO - Found 37 matching IDs between the two dataframes
2025-07-29 14:06:14,955 - compare_sp - INFO - Total predelib IDs: 37
2025-07-29 14:06:14,955 - compare_sp - INFO - Total dashboard IDs: 37
2025-07-29 14:06:14,967 - compare_sp - INFO - Successfully processed 37 matching records
2025-07-29 14:06:14,967 - compare_sp - WARNING - Found 1 mismatches
2025-07-29 14:06:14,968 - compare_sp - INFO - Mismatch - ID 20250706: Predeliberatierapport SP=39, Dashboard Inschrijvingen SP=45
2025-07-29 14:06:14,968 - __main__ - INFO - Processing completed successfully. Found 1 mismatches.
2025-07-29 14:06:14,970 - __main__ - INFO - Processing completed with exit code: 1