import pandas as pd
import logging
from typing import List, Dict, Any

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('sp_comparison.log'),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)


def compare_sp_values(predelib_df: pd.DataFrame, dashboard_df: pd.DataFrame) -> List[Dict[str, Any]]:
    """
    Compare 'Totaal aantal SP' from predelib_df with 'Ingeschr. SP (intern)'
    from dashboard_df for the IDs that appear in both dataframes.

    Args:
        predelib_df (pandas.DataFrame): Dataframe from the predeliberation file,
            with 'ID' and 'Totaal aantal SP' columns
        dashboard_df (pandas.DataFrame): Dataframe from the dashboard file,
            with 'ID' and 'Ingeschr. SP (intern)' columns

    Returns:
        list: List of dictionaries describing the mismatches, or an empty list
            if all values match

    Raises:
        ValueError: If an input dataframe is None or empty
        KeyError: If a required column is missing
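
    Example (an illustrative sketch with made-up toy data):
        >>> pre = pd.DataFrame({'ID': [1], 'Voornaam': ['An'], 'Achternaam': ['Peeters'],
        ...                     'Totaal aantal SP': [60]})
        >>> dash = pd.DataFrame({'ID': [1], 'Ingeschr. SP (intern)': [54]})
        >>> result = compare_sp_values(pre, dash)
        >>> len(result), result[0]['Name']
        (1, 'An Peeters')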
    """
    logger.info("Starting SP values comparison")

    try:
        # Validate input dataframes
        if predelib_df is None or predelib_df.empty:
            error_msg = "Predelib dataframe is None or empty"
            logger.error(error_msg)
            raise ValueError(error_msg)

        if dashboard_df is None or dashboard_df.empty:
            error_msg = "Dashboard dataframe is None or empty"
            logger.error(error_msg)
            raise ValueError(error_msg)

        # Check for required columns
        required_predelib_columns = ['ID', 'Totaal aantal SP']
        required_dashboard_columns = ['ID', 'Ingeschr. SP (intern)']

        missing_predelib_cols = [col for col in required_predelib_columns if col not in predelib_df.columns]
        missing_dashboard_cols = [col for col in required_dashboard_columns if col not in dashboard_df.columns]

        if missing_predelib_cols:
            error_msg = f"Missing required columns in predelib dataframe: {missing_predelib_cols}"
            logger.error(error_msg)
            raise KeyError(error_msg)

        if missing_dashboard_cols:
            error_msg = f"Missing required columns in dashboard dataframe: {missing_dashboard_cols}"
            logger.error(error_msg)
            raise KeyError(error_msg)

        logger.info("All required columns found in both dataframes")

        # Debug ID columns
        logger.debug(f"Predelib ID column type: {predelib_df['ID'].dtype}")
        logger.debug(f"Dashboard ID column type: {dashboard_df['ID'].dtype}")
        logger.debug(f"Sample predelib IDs: {list(predelib_df['ID'].head())}")
        logger.debug(f"Sample dashboard IDs: {list(dashboard_df['ID'].head())}")

        # Convert IDs to strings to ensure consistent comparison
        try:
            predelib_ids = set(str(x) for x in predelib_df['ID'] if pd.notna(x))
            dashboard_ids = set(str(x) for x in dashboard_df['ID'] if pd.notna(x))
        except Exception as e:
            error_msg = f"Error converting IDs to strings: {e}"
            logger.error(error_msg)
            raise ValueError(error_msg) from e

        matching_ids = predelib_ids.intersection(dashboard_ids)
        logger.info(f"Found {len(matching_ids)} matching IDs between the two dataframes")
        logger.info(f"Total predelib IDs: {len(predelib_ids)}")
        logger.info(f"Total dashboard IDs: {len(dashboard_ids)}")

        if len(matching_ids) == 0:
            logger.warning("No matching IDs found between the dataframes")
            return []

        # Compare SP values for matching IDs
        mismatches = []
        processed_count = 0

        for id_val in matching_ids:
            try:
                # Filter both dataframes on stringified IDs, matching the ID
                # sets built above
                predelib_matches = predelib_df[predelib_df['ID'].astype(str) == id_val]
                dashboard_matches = dashboard_df[dashboard_df['ID'].astype(str) == id_val]

                if len(predelib_matches) == 0:
                    logger.warning(f"No predelib records found for ID: {id_val}")
                    continue

                if len(dashboard_matches) == 0:
                    logger.warning(f"No dashboard records found for ID: {id_val}")
                    continue

                predelib_sp = predelib_matches['Totaal aantal SP'].iloc[0]
                dashboard_sp = dashboard_matches['Ingeschr. SP (intern)'].iloc[0]

                # Build the student's name for reporting; the name columns are
                # not validated above, so fall back to empty strings if absent
                first_name = predelib_matches['Voornaam'].iloc[0] if 'Voornaam' in predelib_matches.columns else ''
                last_name = predelib_matches['Achternaam'].iloc[0] if 'Achternaam' in predelib_matches.columns else ''
                name_student = f"{first_name} {last_name}".strip()

                # Skip records where either SP value is missing
                if pd.isna(predelib_sp) or pd.isna(dashboard_sp):
                    logger.warning(f"NaN values found for ID {id_val}: Predelib={predelib_sp}, Dashboard={dashboard_sp}")
                    continue

                # Convert to comparable types (NaN was already ruled out above)
                try:
                    predelib_sp_num = float(predelib_sp)
                    dashboard_sp_num = float(dashboard_sp)
                except (ValueError, TypeError) as e:
                    logger.warning(f"Error converting SP values to numbers for ID {id_val}: {e}")
                    # Fall back to string comparison
                    predelib_sp_num = str(predelib_sp)
                    dashboard_sp_num = str(dashboard_sp)

                if predelib_sp_num != dashboard_sp_num:
                    mismatch = {
                        'ID': id_val,
                        'Name': name_student,
                        'Predelib_SP': predelib_sp,
                        'Dashboard_SP': dashboard_sp,
                    }
                    mismatches.append(mismatch)
                    logger.debug(f"Mismatch found for ID {id_val}: Predelib={predelib_sp}, Dashboard={dashboard_sp}")

                processed_count += 1

            except Exception as e:
                logger.error(f"Error processing ID {id_val}: {e}")
                continue

        logger.info(f"Successfully processed {processed_count} matching records")

        if len(mismatches) == 0:
            logger.info("All SP values match between the two dataframes!")
        else:
            logger.warning(f"Found {len(mismatches)} mismatches")
            for mismatch in mismatches:
                logger.info(
                    f"Mismatch - ID {mismatch['ID']} ({mismatch['Name']}): "
                    f"Predeliberatierapport SP={mismatch['Predelib_SP']}, "
                    f"Dashboard Inschrijvingen SP={mismatch['Dashboard_SP']}"
                )

        return mismatches

    except Exception as e:
        logger.error(f"Unexpected error in compare_sp_values: {e}")
        raise
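

# The per-ID loop in compare_sp_values filters both dataframes once per
# matching ID, which keeps the per-record logging simple but is roughly
# O(n*m) on large files. Below is a minimal vectorized sketch of the same
# comparison built on a single merge over stringified IDs. It is illustrative
# only and is not called by this script: it assumes the same column names as
# compare_sp_values, treats 'Voornaam'/'Achternaam' as optional, skips the
# per-ID logging, and reports one entry per merged row (so duplicated IDs
# yield multiple entries rather than only the first match).
def _compare_sp_values_merged(predelib_df: pd.DataFrame, dashboard_df: pd.DataFrame) -> List[Dict[str, Any]]:
    left = predelib_df.assign(_id=predelib_df['ID'].astype(str))
    right = dashboard_df.assign(_id=dashboard_df['ID'].astype(str))
    merged = left.merge(right[['_id', 'Ingeschr. SP (intern)']], on='_id', how='inner')

    # Drop rows where either value is missing, then compare numerically;
    # values that cannot be parsed as numbers coerce to NaN and are flagged
    # as mismatches
    both = merged.dropna(subset=['Totaal aantal SP', 'Ingeschr. SP (intern)'])
    pre_num = pd.to_numeric(both['Totaal aantal SP'], errors='coerce')
    dash_num = pd.to_numeric(both['Ingeschr. SP (intern)'], errors='coerce')
    diff = both[pre_num != dash_num]

    return [
        {
            'ID': row['_id'],
            'Name': f"{row.get('Voornaam', '')} {row.get('Achternaam', '')}".strip(),
            'Predelib_SP': row['Totaal aantal SP'],
            'Dashboard_SP': row['Ingeschr. SP (intern)'],
        }
        for _, row in diff.iterrows()
    ]
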
if __name__ == "__main__":
    # Example usage - can be used for testing
    logger.info("Starting SP comparison script")

    try:
        from checkheaders import check_headers_predelibfile, check_headers_dashboard_inschrijvingenfile

        # Read the Excel files
        logger.info("Reading Excel files")
        try:
            df_predelib = pd.read_excel('db.xlsx')
            logger.info(f"Successfully loaded predelib file with shape: {df_predelib.shape}")
        except FileNotFoundError:
            logger.error("db.xlsx file not found")
            raise
        except Exception as e:
            logger.error(f"Error reading db.xlsx: {e}")
            raise

        try:
            df_dashboard = pd.read_excel('dashboard_inschrijvingen.xlsx')
            logger.info(f"Successfully loaded dashboard file with shape: {df_dashboard.shape}")
        except FileNotFoundError:
            logger.error("dashboard_inschrijvingen.xlsx file not found")
            raise
        except Exception as e:
            logger.error(f"Error reading dashboard_inschrijvingen.xlsx: {e}")
            raise

        # Process the dataframes
        logger.info("Processing dataframes")
        try:
            processed_predelib_df = check_headers_predelibfile(df_predelib)
            logger.info(f"Processed predelib dataframe shape: {processed_predelib_df.shape}")
        except Exception as e:
            logger.error(f"Error processing predelib file: {e}")
            raise

        try:
            processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)
            logger.info(f"Processed dashboard dataframe shape: {processed_dashboard_df.shape}")
        except Exception as e:
            logger.error(f"Error processing dashboard file: {e}")
            raise

        # Compare SP values between the two processed dataframes
        logger.info("Comparing SP values between the processed dataframes")
        try:
            mismatches = compare_sp_values(processed_predelib_df, processed_dashboard_df)
            logger.info(f"SP comparison completed successfully. Found {len(mismatches)} mismatches.")

            # Print summary for console output
            print(f"\n{'='*50}")
            print("SP COMPARISON SUMMARY")
            print(f"{'='*50}")
            print(f"Predelib records processed: {len(processed_predelib_df)}")
            print(f"Dashboard records processed: {len(processed_dashboard_df)}")
            print(f"Mismatches found: {len(mismatches)}")

            if mismatches:
                print("\nDetailed mismatches:")
                for mismatch in mismatches:
                    print(f"  ID {mismatch['ID']} ({mismatch['Name']}): Predelib={mismatch['Predelib_SP']}, Dashboard={mismatch['Dashboard_SP']}")
            else:
                print("\nAll SP values match perfectly!")
            print(f"{'='*50}")

        except Exception as e:
            logger.error(f"Error during SP comparison: {e}")
            raise

    except ImportError as e:
        logger.error(f"Import error: {e}")
        print("Error: Could not import required modules. Make sure checkheaders.py is in the same directory.")
    except Exception as e:
        logger.error(f"Unexpected error in main execution: {e}")
        print(f"An error occurred: {e}")
        print("Check the log file 'sp_comparison.log' for detailed error information.")
    finally:
        logger.info("SP comparison script completed")