"""Report IDs from a base registration workbook that also occur in ps workbooks."""

import os
from pathlib import Path

import pandas as pd

# Exact column names tried first when looking for the ID / grade columns.
_DEFAULT_ID_COLUMNS = ("ID", "Id", "id", "inschrijving_id")
_DEFAULT_GRADE_COLUMNS = ("Cijfer", "cijfer", "Grade", "grade")

# Column layout of the frame returned by find_duplicates (also when empty).
_RESULT_COLUMNS = ["ID", "Cijfer", "SourceFile"]


def _normalize_id(value):
    """Return a cell value as a comparable ID string, or None when it is missing.

    pandas reads an integer column that contains blank cells as float64, so
    the same ID can show up as ``123`` in one workbook and ``123.0`` in
    another. Integer-valued floats are therefore rendered without the
    trailing ``.0``. Whitespace-only values count as missing.
    """
    if pd.isna(value):
        return None
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    text = str(value).strip()
    return text or None


def _pick_column(columns, candidates, fuzzy_keys=()):
    """Return the first column named in *candidates*, else the first whose
    lower-cased name contains one of *fuzzy_keys*, else None."""
    exact = next((c for c in columns if c in candidates), None)
    if exact is not None:
        return exact
    return next(
        (c for c in columns if any(key in str(c).lower() for key in fuzzy_keys)),
        None,
    )


def _matching_rows(df, id_col, grade_col, base_ids, source_name):
    """Return one result dict per row of *df* whose normalized ID is in *base_ids*.

    Args:
        df: Frame read from one ps workbook.
        id_col: Name of the ID column in *df*.
        grade_col: Name of the grade column in *df*, or None if there is none.
        base_ids: Set of normalized ID strings from the base workbook.
        source_name: Value stored under "SourceFile" in every result dict.
    """
    # Iterate the columns directly instead of using iterrows(): iterrows()
    # upcasts each row to a single dtype, which turns an int ID sitting next
    # to a float grade into "123.0". It also avoids indexing the full frame
    # with a mask built on a NaN-filtered copy, which pandas rejects.
    grades = df[grade_col] if grade_col is not None else [None] * len(df)
    rows = []
    for raw_id, grade in zip(df[id_col], grades):
        id_val = _normalize_id(raw_id)
        if id_val is None or id_val not in base_ids:
            continue
        rows.append(
            {
                "ID": id_val,
                "Cijfer": grade if pd.notna(grade) else None,
                "SourceFile": source_name,
            }
        )
    return rows


def find_duplicates(base_file: Path, ps_files: list[Path], id_col_candidates=None, grade_col_candidates=None):
    """Find IDs of the base registration file that also appear in the ps files.

    Args:
        base_file: Excel workbook holding the reference IDs.
        ps_files: Excel workbooks to scan for those IDs.
        id_col_candidates: Exact ID column names to try, in order of the
            workbook's columns. Defaults to ID / Id / id / inschrijving_id.
        grade_col_candidates: Exact grade column names to try. Defaults to
            Cijfer / cijfer / Grade / grade.

    Returns:
        A DataFrame with columns ID, Cijfer, SourceFile, one row per matching
        row in a ps file. Cijfer is None/NaN when the ps file has no grade
        column or the cell is blank. The columns are present even when no
        match is found.

    Raises:
        ValueError: If the base file has no column named in
            *id_col_candidates*.
    """
    if id_col_candidates is None:
        id_col_candidates = list(_DEFAULT_ID_COLUMNS)
    if grade_col_candidates is None:
        grade_col_candidates = list(_DEFAULT_GRADE_COLUMNS)

    print(f"Reading base file: {base_file}")
    base_df = pd.read_excel(base_file)

    # The base file must name its ID column exactly; no fuzzy fallback here.
    base_id_col = _pick_column(base_df.columns, id_col_candidates)
    if base_id_col is None:
        raise ValueError(f"Could not find an ID column in {base_file}. Tried: {id_col_candidates}")

    base_ids = {
        id_val
        for id_val in map(_normalize_id, base_df[base_id_col])
        if id_val is not None
    }
    print(f"Found {len(base_ids)} IDs in base file (column '{base_id_col}').")

    duplicates = []
    for pf in ps_files:
        print(f"Processing ps file: {pf}")
        try:
            df = pd.read_excel(pf)
        except Exception as e:
            # Deliberate best effort: one unreadable workbook must not abort
            # the scan of the remaining ones.
            print(f" Skipping {pf} - failed to read: {e}")
            continue

        # Exact name first, then any column whose name contains "id".
        id_col = _pick_column(df.columns, id_col_candidates, ("id",))
        if id_col is None:
            print(f" No ID column found in {pf}; skipping.")
            continue

        # The grade column is optional; without it Cijfer stays empty.
        grade_col = _pick_column(df.columns, grade_col_candidates, ("cij", "grade"))

        rows = _matching_rows(df, id_col, grade_col, base_ids, pf.name)
        if not rows:
            print(f" No duplicates found in {pf}.")
            continue

        duplicates.extend(rows)
        print(f" Found {len(rows)} duplicates in {pf}.")

    return pd.DataFrame(duplicates, columns=_RESULT_COLUMNS)


def main():
    """Scan the ps workbooks next to this script and write a CSV summary."""
    folder = Path(__file__).parent
    base = folder / "inschrijvingslijst sociologie.xlsx"

    # Matches files like: ps (82).xls.xlsx
    pattern = "ps *.xls.xlsx"
    ps_files = sorted(folder.glob(pattern))

    if not base.exists():
        print(f"Base file not found: {base}")
        return
    if not ps_files:
        print(f"No ps files found matching pattern '{pattern}'")
        return

    dup_df = find_duplicates(base, ps_files)
    if dup_df.empty:
        print("No duplicates found across provided files.")
        return

    print("Duplicates found (ID - Cijfer - SourceFile):")
    for record in dup_df.itertuples(index=False):
        print(f"{record.ID} - {record.Cijfer} - {record.SourceFile}")

    out_csv = folder / "duplicates_summary.csv"
    dup_df.to_csv(out_csv, index=False)
    print(f"Wrote summary to {out_csv}")


if __name__ == "__main__":
    main()