Refactor code structure and remove redundant code blocks
This commit is contained in:
104
check sociologie inleiding soc/script.py
Normal file
104
check sociologie inleiding soc/script.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def _pick_column(columns, candidates, fuzzy_keys=()):
    """Return the first column listed in *candidates*.

    If no column matches exactly, fall back to the first column whose
    lower-cased name contains any of *fuzzy_keys*. Returns ``None`` when
    nothing matches.
    """
    exact = next((c for c in columns if c in candidates), None)
    if exact is not None:
        return exact
    return next(
        (c for c in columns if any(k in str(c).lower() for k in fuzzy_keys)),
        None,
    )


def _normalize_ids(values):
    """Return the non-null entries of *values* as stripped strings.

    The result keeps the original index labels of the surviving rows so it
    can be used to select those rows from the source frame.
    """
    present = values.dropna()
    # A numeric ID column that contains blanks is loaded as float64, so 123
    # would become "123.0" and never match "123" from an integer column.
    # Render whole-number floats as integers before converting to text.
    if (
        pd.api.types.is_float_dtype(present)
        and (present % 1 == 0).all()
        and (present.abs() < 2**53).all()
    ):
        present = present.astype("int64")
    return present.astype(str).str.strip()


def find_duplicates(base_file: Path, ps_files: list[Path], id_col_candidates=None, grade_col_candidates=None) -> pd.DataFrame:
    """Find IDs from the base registration file that also occur in ps files.

    Args:
        base_file: Excel file holding the reference IDs.
        ps_files: Excel files to check against the base IDs.
        id_col_candidates: Exact column names accepted as the ID column.
            Defaults to ``["ID", "Id", "id", "inschrijving_id"]``.
        grade_col_candidates: Exact column names accepted as the grade column.
            Defaults to ``["Cijfer", "cijfer", "Grade", "grade"]``.

    Returns:
        A DataFrame with columns ``ID``, ``Cijfer``, ``SourceFile``; one row
        per matching ps row. ``Cijfer`` is ``None`` when the ps file has no
        grade column or the grade cell is empty. The columns are present even
        when there are no matches.

    Raises:
        ValueError: If the base file has no column named in
            *id_col_candidates*.
    """
    if id_col_candidates is None:
        id_col_candidates = ["ID", "Id", "id", "inschrijving_id"]
    if grade_col_candidates is None:
        grade_col_candidates = ["Cijfer", "cijfer", "Grade", "grade"]

    # Read base IDs
    print(f"Reading base file: {base_file}")
    base_df = pd.read_excel(base_file)

    # The base file must name its ID column exactly; no fuzzy fallback here.
    base_id_col = _pick_column(base_df.columns, id_col_candidates)
    if base_id_col is None:
        raise ValueError(f"Could not find an ID column in {base_file}. Tried: {id_col_candidates}")

    base_ids = set(_normalize_ids(base_df[base_id_col]))
    print(f"Found {len(base_ids)} IDs in base file (column '{base_id_col}').")

    duplicates = []

    for pf in ps_files:
        print(f"Processing ps file: {pf}")
        try:
            df = pd.read_excel(pf)
        except Exception as e:
            # Best effort: one unreadable ps file must not abort the whole run.
            print(f" Skipping {pf} - failed to read: {e}")
            continue

        # Exact name first, then any column whose name contains 'id'.
        id_col = _pick_column(df.columns, id_col_candidates, fuzzy_keys=("id",))
        if id_col is None:
            print(f" No ID column found in {pf}; skipping.")
            continue

        # The grade column is optional; fuzzy match on 'cij' or 'grade'.
        grade_col = _pick_column(df.columns, grade_col_candidates, fuzzy_keys=("cij", "grade"))

        # Select matching rows by index label. A boolean mask built after
        # dropping blank IDs no longer aligns with df and makes df.loc raise.
        ps_ids = _normalize_ids(df[id_col])
        hits = ps_ids[ps_ids.isin(base_ids)]
        if hits.empty:
            print(f" No duplicates found in {pf}.")
            continue

        if grade_col is None:
            grades = [None] * len(hits)
        else:
            grades = df.loc[hits.index, grade_col]

        # Use the normalized ID text so reported IDs match the compared ones.
        for id_val, grade in zip(hits, grades):
            duplicates.append({
                "ID": id_val,
                "Cijfer": None if pd.isna(grade) else grade,
                "SourceFile": pf.name,
            })

        print(f" Found {len(hits)} duplicates in {pf}.")

    return pd.DataFrame(duplicates, columns=["ID", "Cijfer", "SourceFile"])
|
||||
|
||||
|
||||
def main():
    """Compare the registration list with the ps files next to this script.

    Looks for "inschrijvingslijst sociologie.xlsx" and all files matching
    'ps *.xls.xlsx' in the script's own directory, prints every ID found in
    both, and writes the matches to 'duplicates_summary.csv' in that same
    directory. Returns early with a message when the base file or the ps
    files are missing, or when there are no duplicates.
    """
    script_dir = Path(__file__).parent
    base = script_dir / "inschrijvingslijst sociologie.xlsx"
    # match files like: ps (82).xls.xlsx
    # Kept in one variable so the glob and the message below cannot drift apart.
    ps_pattern = 'ps *.xls.xlsx'
    ps_files = sorted(script_dir.glob(ps_pattern))

    if not base.exists():
        print(f"Base file not found: {base}")
        return

    if not ps_files:
        print(f"No ps files found matching pattern '{ps_pattern}'")
        return

    dup_df = find_duplicates(base, ps_files)

    if dup_df.empty:
        print("No duplicates found across provided files.")
        return

    # print duplicates
    print("Duplicates found (ID - Cijfer - SourceFile):")
    for _, r in dup_df.iterrows():
        print(f"{r['ID']} - {r['Cijfer']} - {r['SourceFile']}")

    out_csv = script_dir / 'duplicates_summary.csv'
    dup_df.to_csv(out_csv, index=False)
    print(f"Wrote summary to {out_csv}")
|
||||
|
||||
|
||||
# Run the duplicate check only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user