Add data processing scripts for exam data and comparison functionality
This commit is contained in:
parent
94ae88e756
commit
eac9bad134
85
startpakketten/checkheaders.py
Normal file
85
startpakketten/checkheaders.py
Normal file
|
@ -0,0 +1,85 @@
|
|||
import pandas as pd
|
||||
|
||||
def check_headers_predelibfile(df):
    """Locate the real header row of a predeliberation frame and trim it.

    If 'Achternaam' and 'Voornaam' are already column names, the frame is
    returned untouched. Otherwise the first row that contains both values is
    promoted to the column header, every row up to and including it is
    dropped, and only the known predeliberation columns are kept.

    Args:
        df (pandas.DataFrame): Frame as read from the predeliberation file.

    Returns:
        pandas.DataFrame: The cleaned frame, or ``df`` itself when the headers
        are already in place or when no header row can be found.
    """
    # Columns retained after the header has been fixed; anything else is dropped.
    columns_to_keep = [
        'ID', 'Achternaam', 'Voornaam', 'E-mail', 'Loopbaan',
        'Drempelteller omschrijving', 'Programma status omschrijving',
        'OO Periode', 'OO Studiegidsnummer', 'OO Lange omschrijving',
        'OO Eenheden', 'OO Sessie', 'OO Credit (Y/N)', 'OO Periode credit',
        'OO Programma code', 'OO Programma korte omschr.', 'Totaal aantal SP',
        'Aantal SP vereist', 'Aantal SP zonder VZP', 'Adviesrapport code',
        'Waarschuwing', 'Lijsttype',
    ]

    # Headers already sit in the column names: nothing to repair.
    if 'Achternaam' in df.columns and 'Voornaam' in df.columns:
        print("Headers found in first row - file already processed, returning unchanged")
        return df

    # Find the header row by POSITION. Index labels are not used because they
    # only coincide with positions for a default RangeIndex, and the slicing
    # below is positional (iloc).
    has_achternaam = df.isin(['Achternaam']).any(axis=1).to_numpy()
    has_voornaam = df.isin(['Voornaam']).any(axis=1).to_numpy()
    is_header = has_achternaam & has_voornaam

    if not is_header.any():
        print("Headers 'Achternaam' and 'Voornaam' not found in the file")
        return df

    # argmax on a boolean array yields the position of the first True.
    header_row = int(is_header.argmax())

    # Promote the header row to column names and drop it together with every
    # row above it. Using a plain list keeps the row label from leaking into
    # ``columns.name``.
    header_values = df.iloc[header_row].tolist()
    df = df.iloc[header_row + 1:].reset_index(drop=True)
    df.columns = header_values

    # Keep only the specified columns (and only those that actually exist).
    existing_columns = [col for col in columns_to_keep if col in df.columns]
    df = df[existing_columns]

    print(f"Deleted {header_row} rows, set proper headers, and kept {len(existing_columns)} columns")
    return df
|
||||
|
||||
def check_headers_dashboard_inschrijvingenfile(df):
    """Locate the real header row of a dashboard-inschrijvingen frame.

    If 'Naam' and 'Voornaam' are already column names, the frame is returned
    untouched. Otherwise the first row that contains both values is promoted
    to the column header and every row up to and including it is dropped.

    Args:
        df (pandas.DataFrame): Frame as read from the dashboard file.

    Returns:
        pandas.DataFrame: The frame with proper headers, or ``df`` itself when
        the headers are already in place or when no header row can be found.
    """
    # Headers already sit in the column names: return early so the frame is
    # never sliced. (Slicing with a -1 "already set" sentinel would keep only
    # the last row and then discard it, leaving an empty frame.)
    if 'Naam' in df.columns and 'Voornaam' in df.columns:
        print("Headers found in first row of dashboard_inschrijvingen - no need to search for header row")
        print("Headers were already correct in dashboard_file.")
        return df

    # Find the header row by POSITION. Index labels are not used because they
    # only coincide with positions for a default RangeIndex, and the slicing
    # below is positional (iloc).
    has_naam = df.isin(['Naam']).any(axis=1).to_numpy()
    has_voornaam = df.isin(['Voornaam']).any(axis=1).to_numpy()
    is_header = has_naam & has_voornaam

    if not is_header.any():
        print("Headers 'Naam' and 'Voornaam' not found in the file")
        return df

    # argmax on a boolean array yields the position of the first True.
    header_row = int(is_header.argmax())

    # Promote the header row to column names and drop it together with every
    # row above it. Using a plain list keeps the row label from leaking into
    # ``columns.name``.
    header_values = df.iloc[header_row].tolist()
    df = df.iloc[header_row + 1:].reset_index(drop=True)
    df.columns = header_values

    print(f"Deleted {header_row} rows in dashboard_file, set proper headers")
    return df
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke run: load both workbooks first, then repair the headers
    # of each one with its matching checker.
    predelib_path, dashboard_path = 'db.xlsx', 'dashboard_inschrijvingen.xlsx'
    df_predelib = pd.read_excel(predelib_path)
    df_dashboard = pd.read_excel(dashboard_path)

    processed_predelib_df = check_headers_predelibfile(df_predelib)
    processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)
|
||||
|
95
startpakketten/compare_sp.py
Normal file
95
startpakketten/compare_sp.py
Normal file
|
@ -0,0 +1,95 @@
|
|||
import pandas as pd
|
||||
|
||||
def compare_sp_values(predelib_df, dashboard_df):
    """Compare 'Totaal aantal SP' with 'Ingeschr. SP (intern)' per shared ID.

    IDs are normalised before matching, so an integer ID and the same ID held
    as a float (which is what a numeric column becomes once it contains a
    blank cell) are treated as the same ID. When an ID occurs more than once
    in a frame, its first occurrence is used. Two missing SP values count as
    equal, and values that both convert to numbers are compared numerically.

    Args:
        predelib_df (pandas.DataFrame): Predeliberation frame with 'ID' and
            'Totaal aantal SP' columns.
        dashboard_df (pandas.DataFrame): Dashboard frame with 'ID' and
            'Ingeschr. SP (intern)' columns.

    Returns:
        list: One dict per mismatch with keys 'ID', 'Predelib_SP' and
        'Dashboard_SP', ordered by first appearance in ``predelib_df``.
        Empty when everything matches, when no IDs are shared, or when a
        required column is missing.
    """
    required_columns = (
        (predelib_df, 'ID', 'predelib'),
        (dashboard_df, 'ID', 'dashboard'),
        (predelib_df, 'Totaal aantal SP', 'predelib'),
        (dashboard_df, 'Ingeschr. SP (intern)', 'dashboard'),
    )
    for frame, column, label in required_columns:
        if column not in frame.columns:
            print(f"Warning: '{column}' column not found in {label} dataframe")
            return []

    # Diagnostics for the ID columns, useful when no IDs match.
    print(f"Predelib ID column type: {predelib_df['ID'].dtype}")
    print(f"Dashboard ID column type: {dashboard_df['ID'].dtype}")
    print(f"Sample predelib IDs: {list(predelib_df['ID'].head())}")
    print(f"Sample dashboard IDs: {list(dashboard_df['ID'].head())}")

    # Build one {normalised ID -> first SP value} lookup per frame up front,
    # instead of re-filtering both frames for every single ID.
    predelib_sp_by_id = _first_sp_by_id(predelib_df, 'Totaal aantal SP')
    dashboard_sp_by_id = _first_sp_by_id(dashboard_df, 'Ingeschr. SP (intern)')

    # Dict order is insertion order, so results follow the predelib frame
    # rather than the arbitrary order of a set.
    matching_ids = [id_val for id_val in predelib_sp_by_id if id_val in dashboard_sp_by_id]
    print(f"Found {len(matching_ids)} matching IDs between the two dataframes")

    if not matching_ids:
        print("No matching IDs found between the dataframes")
        print(f"Total predelib IDs: {len(predelib_sp_by_id)}")
        print(f"Total dashboard IDs: {len(dashboard_sp_by_id)}")
        return []

    mismatches = [
        {
            'ID': id_val,
            'Predelib_SP': predelib_sp_by_id[id_val],
            'Dashboard_SP': dashboard_sp_by_id[id_val],
        }
        for id_val in matching_ids
        if _sp_values_differ(predelib_sp_by_id[id_val], dashboard_sp_by_id[id_val])
    ]

    if not mismatches:
        print("All SP values match between the two dataframes!")
    else:
        print(f"Found {len(mismatches)} mismatches:")
        for mismatch in mismatches:
            print(f"  ID {mismatch['ID']}: Predelib={mismatch['Predelib_SP']}, Dashboard={mismatch['Dashboard_SP']}")

    return mismatches


def _normalize_id(value):
    """Return a canonical string for an ID so 123, 123.0 and ' 123 ' agree."""
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).strip()


def _first_sp_by_id(frame, sp_column):
    """Map each non-missing normalised ID to the SP value of its first row."""
    lookup = {}
    for raw_id, sp_value in zip(frame['ID'], frame[sp_column]):
        if pd.notna(raw_id):
            lookup.setdefault(_normalize_id(raw_id), sp_value)
    return lookup


def _sp_values_differ(left, right):
    """Tell whether two SP cells differ, treating two missing cells as equal."""
    left_missing, right_missing = pd.isna(left), pd.isna(right)
    if left_missing or right_missing:
        return not (left_missing and right_missing)
    try:
        # Compare numerically so that 60, 60.0 and "60" are considered equal.
        return float(left) != float(right)
    except (TypeError, ValueError):
        return left != right
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual test run of the comparison against the two local workbooks.
    from checkheaders import check_headers_predelibfile, check_headers_dashboard_inschrijvingenfile

    # Load both workbooks before doing any processing.
    predelib_path, dashboard_path = 'db.xlsx', 'dashboard_inschrijvingen.xlsx'
    df_predelib = pd.read_excel(predelib_path)
    df_dashboard = pd.read_excel(dashboard_path)

    # Repair the headers of each frame with its matching checker.
    processed_predelib_df = check_headers_predelibfile(df_predelib)
    processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)

    # Report every ID whose SP totals disagree between the two frames.
    print("\nComparing SP values between predelib and dashboard files:")
    mismatches = compare_sp_values(processed_predelib_df, processed_dashboard_df)
|
|
@ -0,0 +1,20 @@
|
|||
import pandas as pd
|
||||
|
||||
from checkheaders import check_headers_dashboard_inschrijvingenfile, check_headers_predelibfile
|
||||
from compare_sp import compare_sp_values
|
||||
|
||||
# Load both Excel workbooks, then repair the headers of each frame.
df_predelib = pd.read_excel('db.xlsx')
df_dashboard = pd.read_excel('dashboard_inschrijvingen.xlsx')
processed_predelib_df = check_headers_predelibfile(df_predelib)
processed_dashboard_df = check_headers_dashboard_inschrijvingenfile(df_dashboard)

# Show both processed frames; each title is followed by its frame on a new line.
print("Processed Predelib DataFrame:", processed_predelib_df, sep="\n")
print("\nProcessed Dashboard DataFrame:", processed_dashboard_df, sep="\n")

# Run the SP comparison; it prints its own report, so the result is not kept.
compare_sp_values(processed_predelib_df, processed_dashboard_df)
print("\nComparison complete.")
|
4
startpakketten/todo.md
Normal file
4
startpakketten/todo.md
Normal file
|
@ -0,0 +1,4 @@
|
|||
Extracurriculaire vakken komen niet uit de wizard. Daarvoor is een aparte Excel nodig, nl. dashboard inschrijvingen.
|
||||
Voor de check of ze extra keuzevakken opnemen en de vereiste dus hoger moet komen te staan: vergelijk de kolommen 'Totaal aantal SP' en 'Aantal SP vereist'.
|
||||
|
||||
Deze kolom moet ook worden gecheckt als ze eigenlijk minder opnemen; dus die moeten altijd aan elkaar gelijk zijn.
|
Loading…
Reference in New Issue
Block a user