import os import sys import pandas as pd import numpy as np SCRIPT_DIR = os.path.dirname(__file__) SCRIPTIE_XLSX = os.path.join(SCRIPT_DIR, "scriptie.xlsx") DASH_XLSX = os.path.join(SCRIPT_DIR, "dashboard_inschrijvingen.xlsx") def find_common_id_col(df1, df2): common = set(df1.columns).intersection(df2.columns) # prefer obvious id-like names for kw in ("ID"): for c in common: if kw in c.lower(): return c if len(common) == 1: return next(iter(common)) if common: return sorted(common)[0] raise ValueError(f"No common column found between files.\nFile1 cols: {df1.columns.tolist()}\nFile2 cols: {df2.columns.tolist()}") def find_col_by_keywords(df, keywords): for k in keywords: for c in df.columns: if k in c.lower(): return c return None def main(): if not os.path.exists(SCRIPTIE_XLSX) or not os.path.exists(DASH_XLSX): print("Missing files. Make sure scriptie.xlsx and dashboard_inschrijvingen.xlsx are in the script folder.") sys.exit(1) df_scriptie = pd.read_excel(SCRIPTIE_XLSX) df_dash = pd.read_excel(DASH_XLSX) try: id_col = find_common_id_col(df_scriptie, df_dash) except ValueError as e: print(e) sys.exit(1) # find relevant columns (best-effort) col_target_oo = find_col_by_keywords(df_scriptie, ["target oo", "target_oo", "targetoo", "target oo"]) col_target_plan = find_col_by_keywords(df_scriptie, ["target plan", "target_plan", "targetplan", "target"]) col_sp = find_col_by_keywords(df_dash, ["sp", "punten", "ects", "study points"]) if col_target_oo is None or col_target_plan is None or col_sp is None: print("Could not locate required columns. Detected:") print("scriptie columns:", df_scriptie.columns.tolist()) print("dashboard columns:", df_dash.columns.tolist()) print(f"Found -> target_oo: {col_target_oo}, target_plan: {col_target_plan}, SP: {col_sp}") sys.exit(1) merged = df_scriptie.merge(df_dash, on=id_col, how="inner", suffixes=("_scriptie", "_dash")) # filter rows where Target OO contains 1070FLWGES mask_oo = merged[col_target_oo].astype(str).str.contains("1070FLWGES", na=False) subset = merged[mask_oo].copy() if subset.empty: print("No rows with Target OO containing '1070FLWGES'. No violations.") return # determine thresholds per row: default 180-9 = 171, if Target plan contains 'Ba geschiedenis (major)' then 180-12 = 168 plan_contains = subset[col_target_plan].astype(str).str.contains("ba geschiedenis (major)", case=False, na=False) subset["threshold"] = np.where(plan_contains, 180 - 12, 180 - 9) # coerce SP to numeric (non-numeric become NaN) subset["SP_value"] = pd.to_numeric(subset[col_sp], errors="coerce") # violation: SP is NaN or SP < threshold violations = subset[subset["SP_value"] < subset["threshold"]] # also consider NaN as violation nan_viol = subset[subset["SP_value"].isna()] violations = pd.concat([violations, nan_viol]).drop_duplicates() if violations.empty: print("No violations found for entries with Target OO == 1070FLWGES.") return report_cols = [id_col, col_sp, "SP_value", "threshold", col_target_plan, col_target_oo] report = violations.loc[:, [id_col, col_sp, "SP_value", "threshold", col_target_plan, col_target_oo]] report = report.rename(columns={col_sp: "SP_raw", col_target_plan: "Target_plan", col_target_oo: "Target_OO"}) out_csv = os.path.join(SCRIPT_DIR, "violations_report.csv") out_xlsx = os.path.join(SCRIPT_DIR, "violations_report.xlsx") report.to_csv(out_csv, index=False) try: report.to_excel(out_xlsx, index=False) except Exception: pass print(f"Found {len(report)} violation(s). Saved to: {out_csv} (and {out_xlsx} if Excel write succeeded).") print(report.to_string(index=False)) if __name__ == "__main__": main()