Add data processing scripts for exam data and comparison functionality

This commit is contained in:
bdaneels 2025-07-29 09:28:23 +02:00
parent 94ae88e756
commit eac9bad134
4 changed files with 204 additions and 0 deletions

View File

@ -0,0 +1,85 @@
import pandas as pd
def check_headers_predelibfile(df):
    """Promote the row containing 'Achternaam' and 'Voornaam' to column headers.

    If both names are already present in ``df.columns`` the frame is returned
    untouched. Otherwise the rows are scanned top to bottom for the first row
    whose cells contain both names; every row above it is dropped, that row
    becomes the header, and only the known pre-deliberation columns that
    actually exist are kept (in the order listed below).

    Args:
        df: DataFrame as read from the pre-deliberation Excel export.

    Returns:
        The cleaned DataFrame, or ``df`` unchanged when the headers were
        already in place or no header row could be found.
    """
    if 'Achternaam' in df.columns and 'Voornaam' in df.columns:
        print("Headers found in first row - file already processed, returning unchanged")
        return df

    # Track the row *position* rather than the index label: the slicing below
    # uses iloc, which is positional, so labels from a non-default index
    # (filtered or re-indexed frames) would point at the wrong row.
    header_row = None
    for position, (_, row) in enumerate(df.iterrows()):
        if 'Achternaam' in row.values and 'Voornaam' in row.values:
            header_row = position
            break

    if header_row is None:
        print("Headers 'Achternaam' and 'Voornaam' not found in the file")
        return df

    # Drop everything above the header row, then promote it to column names.
    df = df.iloc[header_row:].reset_index(drop=True)
    df.columns = df.iloc[0]
    df = df.iloc[1:].reset_index(drop=True)

    columns_to_keep = [
        'ID', 'Achternaam', 'Voornaam', 'E-mail', 'Loopbaan',
        'Drempelteller omschrijving', 'Programma status omschrijving',
        'OO Periode', 'OO Studiegidsnummer', 'OO Lange omschrijving',
        'OO Eenheden', 'OO Sessie', 'OO Credit (Y/N)', 'OO Periode credit',
        'OO Programma code', 'OO Programma korte omschr.', 'Totaal aantal SP',
        'Aantal SP vereist', 'Aantal SP zonder VZP', 'Adviesrapport code',
        'Waarschuwing', 'Lijsttype',
    ]
    # Columns missing from the export are skipped instead of raising KeyError.
    existing_columns = [col for col in columns_to_keep if col in df.columns]
    df = df[existing_columns]
    print(f"Deleted {header_row} rows, set proper headers, and kept {len(existing_columns)} columns")
    return df
def check_headers_dashboard_inschrijvingenfile(df):
    """Promote the row containing 'Naam' and 'Voornaam' to column headers.

    If both names are already present in ``df.columns`` the frame is returned
    untouched. Otherwise the rows are scanned top to bottom for the first row
    whose cells contain both names; every row above it is dropped and that row
    becomes the header.

    Args:
        df: DataFrame as read from the dashboard-inschrijvingen Excel export.

    Returns:
        A DataFrame on every path: the cleaned frame, or ``df`` unchanged when
        the headers were already in place or no header row could be found.
    """
    if 'Naam' in df.columns and 'Voornaam' in df.columns:
        print("Headers found in first row of dashboard_inschrijvingen - no need to search for header row")
        print("Headers were already correct in dashboard_file.")
        return df

    # Track the row *position* rather than the index label: the slicing below
    # uses iloc, which is positional.
    header_row = None
    for position, (_, row) in enumerate(df.iterrows()):
        if 'Naam' in row.values and 'Voornaam' in row.values:
            header_row = position
            break

    if header_row is None:
        # The message names the headers this function actually searches for.
        print("Headers 'Naam' and 'Voornaam' not found in the file")
        return df

    # Drop everything above the header row, then promote it to column names.
    df = df.iloc[header_row:].reset_index(drop=True)
    df.columns = df.iloc[0]
    df = df.iloc[1:].reset_index(drop=True)
    print(f"Deleted {header_row} rows in dashboard_file, set proper headers")
    return df
if __name__ == "__main__":
    # Manual smoke run: load both Excel exports from the working directory
    # first, then normalise the headers of each one.
    predelib_source = 'db.xlsx'
    dashboard_source = 'dashboard_inschrijvingen.xlsx'
    df_predelib = pd.read_excel(predelib_source)
    df_dashboard = pd.read_excel(dashboard_source)

    # The cleaned frames are kept in module globals for interactive inspection.
    processed_predelib_df = check_headers_predelibfile(df_predelib)
    processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)

View File

@ -0,0 +1,95 @@
import pandas as pd
def _normalize_id(raw_id):
    """Return a string key for an ID, rendering integral floats without '.0'."""
    # A numeric ID column that contains blanks is read as float, so 123 becomes
    # 123.0; without this it would never match the "123" key of the other file.
    if isinstance(raw_id, float) and raw_id.is_integer():
        return str(int(raw_id))
    return str(raw_id)


def _first_sp_by_id(frame, sp_column):
    """Map each non-missing ID key to the SP value of its first row."""
    lookup = {}
    for raw_id, sp_value in zip(frame['ID'], frame[sp_column]):
        if pd.notna(raw_id):
            # setdefault keeps the first occurrence when an ID repeats.
            lookup.setdefault(_normalize_id(raw_id), sp_value)
    return lookup


def _sp_values_equal(left, right):
    """Compare two SP cells, treating two missing values as equal."""
    left_missing, right_missing = pd.isna(left), pd.isna(right)
    if left_missing or right_missing:
        return bool(left_missing and right_missing)
    try:
        # Compare numerically so 60, 60.0 and "60" are considered the same.
        return float(left) == float(right)
    except (TypeError, ValueError):
        return bool(left == right)


def compare_sp_values(predelib_df, dashboard_df):
    """Compare 'Totaal aantal SP' with 'Ingeschr. SP (intern)' per shared ID.

    IDs are compared as strings (integral floats such as 123.0 are rendered as
    "123"). When an ID occurs on several rows, the first row of each frame is
    used. Two missing SP values count as equal; values that both convert to
    float are compared numerically, anything else with ``==``.

    Args:
        predelib_df (pandas.DataFrame): Pre-deliberation data with 'ID' and
            'Totaal aantal SP' columns.
        dashboard_df (pandas.DataFrame): Dashboard data with 'ID' and
            'Ingeschr. SP (intern)' columns.

    Returns:
        list: One dict per mismatch with keys 'ID' (string key),
        'Predelib_SP' and 'Dashboard_SP', in order of first appearance in
        ``predelib_df``. Empty when everything matches, when no IDs are
        shared, or when a required column is missing.
    """
    required_columns = (
        (predelib_df, 'ID', 'predelib'),
        (dashboard_df, 'ID', 'dashboard'),
        (predelib_df, 'Totaal aantal SP', 'predelib'),
        (dashboard_df, 'Ingeschr. SP (intern)', 'dashboard'),
    )
    for frame, column, label in required_columns:
        if column not in frame.columns:
            print(f"Warning: '{column}' column not found in {label} dataframe")
            return []

    # Diagnostics that help explain an unexpectedly empty ID intersection.
    print(f"Predelib ID column type: {predelib_df['ID'].dtype}")
    print(f"Dashboard ID column type: {dashboard_df['ID'].dtype}")
    print(f"Sample predelib IDs: {list(predelib_df['ID'].head())}")
    print(f"Sample dashboard IDs: {list(dashboard_df['ID'].head())}")

    # Build each lookup once instead of re-scanning both frames per ID.
    predelib_sp_by_id = _first_sp_by_id(predelib_df, 'Totaal aantal SP')
    dashboard_sp_by_id = _first_sp_by_id(dashboard_df, 'Ingeschr. SP (intern)')

    # Dict order follows predelib row order, so the report is deterministic.
    matching_ids = [key for key in predelib_sp_by_id if key in dashboard_sp_by_id]
    print(f"Found {len(matching_ids)} matching IDs between the two dataframes")
    if not matching_ids:
        print("No matching IDs found between the dataframes")
        print(f"Total predelib IDs: {len(predelib_sp_by_id)}")
        print(f"Total dashboard IDs: {len(dashboard_sp_by_id)}")
        return []

    mismatches = []
    for id_val in matching_ids:
        predelib_sp = predelib_sp_by_id[id_val]
        dashboard_sp = dashboard_sp_by_id[id_val]
        if not _sp_values_equal(predelib_sp, dashboard_sp):
            mismatches.append({
                'ID': id_val,
                'Predelib_SP': predelib_sp,
                'Dashboard_SP': dashboard_sp,
            })

    if not mismatches:
        print("All SP values match between the two dataframes!")
    else:
        print(f"Found {len(mismatches)} mismatches:")
        for mismatch in mismatches:
            print(f"  ID {mismatch['ID']}: Predelib={mismatch['Predelib_SP']}, Dashboard={mismatch['Dashboard_SP']}")
    return mismatches
if __name__ == "__main__":
    # Manual test run of the comparison against the two Excel exports.
    from checkheaders import check_headers_predelibfile, check_headers_dashboard_inschrijvingenfile

    # Load both workbooks from the working directory before any processing.
    predelib_source = 'db.xlsx'
    dashboard_source = 'dashboard_inschrijvingen.xlsx'
    df_predelib = pd.read_excel(predelib_source)
    df_dashboard = pd.read_excel(dashboard_source)

    # Normalise the headers of each frame.
    processed_predelib_df = check_headers_predelibfile(df_predelib)
    processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)

    # Report SP differences between the two cleaned frames.
    print("\nComparing SP values between predelib and dashboard files:")
    mismatches = compare_sp_values(processed_predelib_df, processed_dashboard_df)

View File

@ -0,0 +1,20 @@
import pandas as pd
from checkheaders import check_headers_dashboard_inschrijvingenfile, check_headers_predelibfile
from compare_sp import compare_sp_values
# Load both Excel exports from the working directory.
df_predelib = pd.read_excel('db.xlsx')
df_dashboard = pd.read_excel('dashboard_inschrijvingen.xlsx')

# Normalise the headers of each frame.
processed_predelib_df = check_headers_predelibfile(df_predelib)
processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)

# Show the cleaned frames; each title is followed by its frame on a new line.
print("Processed Predelib DataFrame:", processed_predelib_df, sep="\n")
print("\nProcessed Dashboard DataFrame:", processed_dashboard_df, sep="\n")

# Report SP differences between the two frames.
compare_sp_values(processed_predelib_df, processed_dashboard_df)
print("\nComparison complete.")

4
startpakketten/todo.md Normal file
View File

@ -0,0 +1,4 @@
Extracurriculaire vakken komen niet uit de wizard; daar is een aparte Excel voor nodig, nl. dashboard inschrijvingen.
Voor de check of ze extra keuzevakken opnemen en de vereiste dus hoger moet komen te staan: vergelijk de kolommen 'Totaal aantal SP' en 'Aantal SP vereist'.
Deze kolom moet ook worden gecheckt als ze eigenlijk minder opnemen; dus die moeten altijd aan elkaar gelijk zijn.