Refactor code structure and remove redundant code blocks

bdaneels 2025-10-21 14:20:04 +02:00
parent bcf8f3acae
commit 468df81386
10 changed files with 2054 additions and 19 deletions

View File

@ -0,0 +1,104 @@
import os
from pathlib import Path

import pandas as pd


def find_duplicates(base_file: Path, ps_files: list[Path], id_col_candidates=None, grade_col_candidates=None):
    """Read the base registration file and several ps files, then find IDs that appear in both.

    Returns a DataFrame with columns: ID, Cijfer, SourceFile
    """
    if id_col_candidates is None:
        id_col_candidates = ["ID", "Id", "id", "inschrijving_id"]
    if grade_col_candidates is None:
        grade_col_candidates = ["Cijfer", "cijfer", "Grade", "grade"]

    # Read base IDs
    print(f"Reading base file: {base_file}")
    base_df = pd.read_excel(base_file)

    # Find the ID column in the base file
    base_id_col = next((c for c in base_df.columns if c in id_col_candidates), None)
    if base_id_col is None:
        raise ValueError(f"Could not find an ID column in {base_file}. Tried: {id_col_candidates}")
    base_ids = set(base_df[base_id_col].dropna().astype(str).str.strip())
    print(f"Found {len(base_ids)} IDs in base file (column '{base_id_col}').")

    duplicates = []
    for pf in ps_files:
        print(f"Processing ps file: {pf}")
        try:
            df = pd.read_excel(pf)
        except Exception as e:
            print(f"  Skipping {pf} - failed to read: {e}")
            continue

        # Guess the ID column
        id_col = next((c for c in df.columns if c in id_col_candidates), None)
        if id_col is None:
            # Fuzzy fallback: any column name containing 'id'
            id_col = next((c for c in df.columns if 'id' in str(c).lower()), None)
        if id_col is None:
            print(f"  No ID column found in {pf}; skipping.")
            continue

        grade_col = next((c for c in df.columns if c in grade_col_candidates), None)
        if grade_col is None:
            # Fuzzy fallback: any column name containing 'cij' or 'grade'
            grade_col = next((c for c in df.columns if any(k in str(c).lower() for k in ['cij', 'grade'])), None)

        # Normalize IDs to stripped strings and intersect with the base IDs.
        # Reindex the mask to the full frame so .loc can align it even when rows with NaN IDs were dropped.
        df_ids = df[id_col].dropna().astype(str).str.strip()
        mask = df_ids.isin(base_ids).reindex(df.index, fill_value=False)
        matched = df.loc[mask]
        if matched.empty:
            print(f"  No duplicates found in {pf}.")
            continue

        # Collect results
        for _, row in matched.iterrows():
            id_val = str(row[id_col]).strip()
            grade_val = row[grade_col] if (grade_col is not None and pd.notna(row[grade_col])) else None
            duplicates.append({"ID": id_val, "Cijfer": grade_val, "SourceFile": pf.name})
        print(f"  Found {len(matched)} duplicates in {pf}.")

    dup_df = pd.DataFrame(duplicates)
    return dup_df


def main():
    base = Path(__file__).parent / "inschrijvingslijst sociologie.xlsx"
    # Match files like: ps (82).xls.xlsx
    ps_files = sorted(Path(__file__).parent.glob('ps *.xls.xlsx'))
    if not base.exists():
        print(f"Base file not found: {base}")
        return
    if not ps_files:
        print("No ps files found matching pattern 'ps *.xls.xlsx'")
        return

    dup_df = find_duplicates(base, ps_files)
    if dup_df.empty:
        print("No duplicates found across provided files.")
    else:
        # Print duplicates
        print("Duplicates found (ID - Cijfer - SourceFile):")
        for _, r in dup_df.iterrows():
            print(f"{r['ID']} - {r['Cijfer']} - {r['SourceFile']}")
        out_csv = Path(__file__).parent / 'duplicates_summary.csv'
        dup_df.to_csv(out_csv, index=False)
        print(f"Wrote summary to {out_csv}")


if __name__ == '__main__':
    main()
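
Assuming the script is saved as, say, `find_duplicates.py` (the actual filename is not shown in this diff), it is run from the folder that holds the base registration list and the `ps *.xls.xlsx` exports:

```sh
# the script globs 'ps *.xls.xlsx' next to itself and writes duplicates_summary.csv
python find_duplicates.py
```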

View File

@ -0,0 +1,4 @@
/Mentoraat_2024-2025.xlsx
/reinoud.xlsx
/sisa.xlsx
*.xlsx

View File

@ -0,0 +1,90 @@
# Script Documentation
## Overview
This script processes two Excel files (`reinoud.xlsx` and `sisa.xlsx`): it finds IDs that are present in `sisa.xlsx` but missing from `reinoud.xlsx` and appends them to `reinoud.xlsx`. It also checks for duplicate IDs in `reinoud.xlsx`.
## Functions
### `load_excel(file_path: str, sheet_name: Optional[str] = None) -> pd.DataFrame`
Loads an Excel file into a DataFrame.
### `check_duplicates(df: pd.DataFrame, column: str) -> List[str]`
Checks for duplicate values in a specified column.
### `find_missing_ids(df1: pd.DataFrame, df2: pd.DataFrame, column: str) -> List[str]`
Finds IDs in `df2` that are not in `df1`.
### `append_missing_ids(reinoud_df: pd.DataFrame, sisa_df: pd.DataFrame, column: str, reinoud_file: str) -> pd.DataFrame`
Appends missing IDs and corresponding details from `sisa_df` to `reinoud_df`.
### `main(reinoud_file: str, sisa_file: str, column: str, reinoud_sheet: Optional[str] = None, sisa_sheet: Optional[str] = None)`
Main function: loads the Excel files, checks for duplicates, appends missing IDs, and saves the updated DataFrame back to the Excel file.
## Usage
Run the script with the following command:
```sh
python script.py
```
Example usage within the script:
```python
if __name__ == "__main__":
    main('reinoud.xlsx', 'sisa.xlsx', 'Rolnummer', reinoud_sheet='Actief', sisa_sheet='sheet1')
```
## Logging
The script uses the `logging` module to log information and errors. The log level is set to `INFO`.
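For reference, the whole logging setup is a single `basicConfig` call at module import time; a minimal sketch mirroring the configuration used in `script.py`:

```python
import logging

# Log INFO and above, formatted as "LEVEL: message"
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logging.info("reinoud.xlsx loaded")  # prints: INFO: reinoud.xlsx loaded
```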
## File Structure
```
.gitignore
reinoud.xlsx
script.py
sisa.xlsx
```
## Dependencies
- pandas (plus an Excel engine such as openpyxl for `.xlsx` files)
- logging (part of the Python standard library, no installation needed)
Install the external dependencies using:
```sh
pip install pandas openpyxl
```
## License
This script is provided "as-is" without any warranty. Use at your own risk.

View File

@ -0,0 +1,82 @@
import pandas as pd
import logging
from typing import List, Optional

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')


def load_excel(file_path: str, sheet_name: Optional[str] = None) -> pd.DataFrame:
    """Load an Excel file into a DataFrame."""
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        if isinstance(df, dict):
            raise ValueError(f"Multiple sheets found in {file_path}. Please specify a sheet name.")
        return df
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        raise
    except Exception as e:
        logging.error(f"Error loading file {file_path}: {e}")
        raise


def check_duplicates(df: pd.DataFrame, column: str) -> List[str]:
    """Check for duplicate values in a specified column."""
    duplicates = df[column].astype(str)[df[column].duplicated()]
    return duplicates.tolist()


def find_missing_ids(df1: pd.DataFrame, df2: pd.DataFrame, column: str) -> List[str]:
    """Find IDs in df2 that are not in df1."""
    ids1 = df1[column].astype(str)
    ids2 = df2[column].astype(str)
    missing_ids = ids2[~ids2.isin(ids1)]
    return missing_ids.tolist()


def append_missing_ids(reinoud_df: pd.DataFrame, sisa_df: pd.DataFrame, column: str, reinoud_file: str) -> pd.DataFrame:
    """Append missing IDs and corresponding Naam, Voornaam, Plan, and Campus emailadres to reinoud_df."""
    missing_ids = find_missing_ids(reinoud_df, sisa_df, column)
    if missing_ids:
        missing_rows = sisa_df[sisa_df[column].astype(str).isin(missing_ids)]
        # Select only the specified columns
        selected_columns = ['Rolnummer', 'Naam', 'Voornaam', 'Plan', 'Campus emailadres']
        missing_rows = missing_rows[selected_columns]
        # Rename 'Campus emailadres' to 'mail' for reinoud_df
        missing_rows = missing_rows.rename(columns={'Campus emailadres': 'mail'})
        # Append missing rows to reinoud_df
        reinoud_df = pd.concat([reinoud_df, missing_rows], ignore_index=True)
        logging.info(f"Appended missing IDs to {reinoud_file}:")
        for _, row in missing_rows.iterrows():
            logging.info(f"ID: {row[column]}, Naam: {row['Naam']}, Voornaam: {row['Voornaam']}, Plan: {row['Plan']}, mail: {row['mail']}")
    else:
        logging.info("No missing IDs to append.")
    return reinoud_df


def main(reinoud_file: str, sisa_file: str, column: str, reinoud_sheet: Optional[str] = None, sisa_sheet: Optional[str] = None):
    # Load the Excel files
    reinoud_df = load_excel(reinoud_file, sheet_name=reinoud_sheet)
    sisa_df = load_excel(sisa_file, sheet_name=sisa_sheet)

    # Debug: print the columns of sisa_df
    logging.info(f"Columns in {sisa_file}: {sisa_df.columns.tolist()}")

    # Check for duplicates in reinoud
    duplicates = check_duplicates(reinoud_df, column)
    if duplicates:
        logging.info("Duplicate IDs in reinoud.xlsx:")
        logging.info(duplicates)
    else:
        logging.info("No duplicates found in reinoud.xlsx.")

    # Append missing IDs from sisa to reinoud
    reinoud_df = append_missing_ids(reinoud_df, sisa_df, column, reinoud_file)

    # Save the updated reinoud_df back to the Excel file.
    # NOTE: to_excel rewrites the workbook with only this sheet; other sheets are not preserved.
    reinoud_df.to_excel(reinoud_file, sheet_name=reinoud_sheet or "Sheet1", index=False)
    logging.info(f"Updated {reinoud_file} saved.")


if __name__ == "__main__":
    # Example usage
    # Change the file names, column name, and sheet names as needed
    main('reinoud.xlsx', 'sisa.xlsx', 'Rolnummer', reinoud_sheet='Actief', sisa_sheet='sheet1')

View File

@ -0,0 +1,326 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Update 'BAGES' sheet in 'ongeloofelijken tool.xlsx' with the latest (2025-2026) bachelor History
study programme from UAntwerpen. It scrapes the official page and writes a normalized table.

Source page (2025-2026 bachelor study programme):
https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/

- In 2025-2026 the 'Geschiedenis per periode en gebied' structure changed to a two-pillar model:
    * Chronologische pijler: 3 OOs (middeleeuwen, nieuwe tijd, nieuwste tijd)
    * Thematische pijler: 2 OOs
  (See faculty helpdesk note with change summary and transition measures.)

IMPORTANT:
- This script only updates the 'BAGES' (Bachelor) sheet, because the provided link covers the bachelor page.
- 'MAGES' and 'SPVP' sheets remain untouched.

Tested with: requests, beautifulsoup4, lxml, pandas, openpyxl
"""
import re
import sys
import time
import urllib.parse
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException

# ------------------------- Configuration -------------------------
EXCEL_PATH = "ongeloofelijken tool.xlsx"
TARGET_SHEET = "BAGES"
ARCHIVE_PREFIX = "BAGES_OLD_"
UA_BA_URL = "https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/"
TARGET_YEAR_PREFIX = "2025-"  # Anchor/course URLs have '?id=<year>-<code>'; we filter with '2025-'
TIMEOUT = 30
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; uantwerpen-bages-updater/1.0; +https://www.uantwerpen.be/)",
    "Accept-Language": "nl,en;q=0.8"
}

# ------------------------- Helpers -------------------------
def fetch_html(url: str) -> BeautifulSoup:
    resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")


def extract_text(el) -> str:
    return re.sub(r"\s+", " ", " ".join(el.stripped_strings)) if el else ""


def parse_meta_from_block(block_text: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """
    Try to parse Semester, Credits, Lecturers, Notes from a block of text next to a course link.
    Returns (semester, credits, lecturers, notes)
    """
    text = block_text

    # Semester examples: '1E SEM', '2E SEM', '1E/2E SEM'
    sem = None
    m_sem = re.search(r"\b(1E\s*/\s*2E\s*SEM|1E\s*SEM|2E\s*SEM)\b", text, flags=re.I)
    if m_sem:
        sem = m_sem.group(1).upper().replace(" ", "")

    # Credits examples: '6 studiepunten', '3 studiepunten'
    credits = None
    m_sp = re.search(r"(\d+)\s*studiepunten", text, flags=re.I)
    if m_sp:
        credits = m_sp.group(1)

    # Lecturers: after 'Lesgever (s):' or 'Lesgever(s):'
    lecturers = None
    m_lect = re.search(r"Lesgever\s*\(s\)\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if not m_lect:
        m_lect = re.search(r"Lesgever[s]?\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if m_lect:
        lecturers = m_lect.group(1).strip(" .").replace(" ,", ",")

    # Notes: look for two-yearly etc.
    notes = None
    if re.search(r"Tweejaarlijks", text, flags=re.I):
        # Try to capture the "even/oneven" phrasing
        m_ev = re.search(r"tweejaarlijks[^.]*?(even|oneven)[^.]*jaar", text, flags=re.I)
        notes = "Tweejaarlijks" + (f" ({m_ev.group(1).lower()} jaar)" if m_ev else "")

    return sem, credits, lecturers, notes


def nearest_sections(a_tag) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Find nearest preceding headings to classify the row.
    Returns (section, subsection, pillar)
    - section: e.g., 'Modeltraject deel 1/2/3'
    - subsection: e.g., 'Wijsbegeerte en sociale wetenschappen', 'Keuzeopleidingsonderdelen', 'Inleiding tot de geschiedenis', etc.
    - pillar: for deel 3: 'Chronologische pijler', 'Thematische pijler' or None
    """
    # The site uses a variety of headings (h2, h3, h4); we trace back to find labels
    section = subsection = pillar = None

    # Walk up multiple previous headings to capture a hierarchy
    prev_heads = []
    cur = a_tag
    for _ in range(40):  # limit walk to avoid infinite loops
        cur = cur.find_previous(["h2", "h3", "h4", "h5"])
        if not cur:
            break
        txt = extract_text(cur)
        prev_heads.append(txt)

    # Determine labels from the nearest few headings
    for txt in prev_heads:
        t = txt.lower()
        if section is None and "modeltraject deel" in t:
            # Normalize like "Modeltraject deel 1"
            section = txt
        if subsection is None:
            # Typical subsections
            if any(k in t for k in [
                "wijsbegeerte en sociale wetenschappen",
                "methodologie van de geschiedenis",
                "historische oefeningen",
                "inleiding tot de geschiedenis",
                "heuristiek",
                "historisch overzicht",
                "keuzeopleidingsonderdelen",
                "sociale wetenschappen",
            ]):
                subsection = txt
        if pillar is None and ("chronologische pijler" in t or "thematische pijler" in t):
            pillar = txt
        if section and (subsection or pillar):
            # Good enough
            break

    return section, subsection, pillar


def parse_courses_from_page(soup: BeautifulSoup) -> pd.DataFrame:
    """
    Parse all course links for the 2025-xxxx academic year, infer metadata from nearby text,
    and return a normalized DataFrame.
    """
    rows = []

    # Capture all anchors that look like course links containing '?id=2025-<CODE>'
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Normalize relative links
        full_url = urllib.parse.urljoin(UA_BA_URL, href)

        # Filter by the 'id=2025-' parameter (2025-2026)
        if "id=" in href:
            q = urllib.parse.urlparse(href).query
            params = urllib.parse.parse_qs(q)
            ids = params.get("id", [])
            if not ids:
                continue
            # Some pages use '2025-XXXXX' or '2025-XXXXX&lang=nl'
            if not any(idv.startswith(TARGET_YEAR_PREFIX) for idv in ids):
                continue
            course_id = ids[0]  # e.g., '2025-1002FLWGES'
        else:
            # No id=... parameter; skip
            continue

        # Extract code after '2025-'
        code = None
        m = re.match(r"2025-([A-Za-z0-9]+)", course_id)
        if m:
            code = m.group(1)

        name = extract_text(a).strip()
        if not name or not code:
            continue

        # Use a reasonably large ancestor block for metadata search
        container = a
        for _ in range(4):
            if container.parent:
                container = container.parent
        block_text = extract_text(container)

        semester, credits, lecturers, notes = parse_meta_from_block(block_text)
        section, subsection, pillar = nearest_sections(a)

        rows.append({
            "Section": section,
            "Subsection": subsection,
            "Pillar": pillar,
            "Course Code": code,
            "Course Name": name,
            "URL": full_url,
            "Semester": semester,
            "Credits": credits,
            "Lecturers": lecturers,
            "Notes": notes
        })

    df = pd.DataFrame(rows)
    if df.empty:
        # No 2025- course links found; let the caller report the warning
        return df
    df = df.drop_duplicates(subset=["Course Code", "Course Name"])

    # Keep only rows that clearly belong to the 'Bachelor' page; sometimes cross-links appear
    # Heuristic: we keep rows with a Section that starts with "Modeltraject deel" or that have a Pillar marker
    mask = (
        df["Section"].fillna("").str.contains(r"Modeltraject deel", case=False) |
        df["Pillar"].fillna("").str.contains(r"Pijler", case=False)
    )
    df = df[mask].copy()

    # Clean up text for consistency (fill missing values first so astype(str) does not produce 'None' strings)
    def clean_col(s):
        return s.str.replace(r"\s+", " ", regex=True).str.strip()

    for col in ["Section", "Subsection", "Pillar", "Course Name", "Lecturers", "Notes"]:
        df[col] = clean_col(df[col].fillna("").astype(str))

    # Ensure missing pillar/subsection are empty strings for consistent sorting
    df["Pillar"] = df["Pillar"].fillna("")
    df["Subsection"] = df["Subsection"].fillna("")

    # Sort for readability: section → pillar → subsection → name
    df.sort_values(
        by=["Section", "Pillar", "Subsection", "Course Name"],
        inplace=True
    )
    df.reset_index(drop=True, inplace=True)
    return df


def archive_and_write(excel_path: str, df: pd.DataFrame, target_sheet: str):
    """
    - If sheet 'BAGES' exists, rename it to 'BAGES_OLD_YYYYMMDD'
    - Write df to 'BAGES'
    """
    try:
        wb = load_workbook(excel_path)
    except FileNotFoundError:
        print(f"[INFO] File not found, creating new workbook: {excel_path}")
        # Write a new file straight away
        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        return
    except InvalidFileException:
        print(f"[ERROR] Not a valid Excel file: {excel_path}")
        sys.exit(1)

    # Rename existing BAGES to archive
    if target_sheet in wb.sheetnames:
        date_suffix = datetime.now().strftime("%Y%m%d")
        archive_name = ARCHIVE_PREFIX + date_suffix
        # Ensure uniqueness (append a counter if necessary)
        counter = 1
        final_archive = archive_name
        while final_archive in wb.sheetnames:
            counter += 1
            final_archive = f"{archive_name}_{counter}"
        ws = wb[target_sheet]
        ws.title = final_archive
        print(f"[INFO] Archived existing '{target_sheet}' as '{final_archive}'")
        # Save intermediate
        wb.save(excel_path)

    # Now write the new sheet
    with pd.ExcelWriter(excel_path, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
        df.to_excel(writer, sheet_name=target_sheet, index=False)
    print(f"[SUCCESS] Wrote updated '{target_sheet}' sheet to '{excel_path}'")

    # Also save the updated DataFrame to a separate new Excel file for convenience
    try:
        src = Path(excel_path)
        new_name = src.with_name(f"{src.stem}_updated{src.suffix}")
        # Write a fresh workbook containing only the updated sheet
        with pd.ExcelWriter(str(new_name), engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        print(f"[INFO] Also wrote updated data to new file '{new_name}'")
    except Exception as e:
        print(f"[WARNING] Could not write updated copy to new file: {e}")


def save_copy(dest_name: str = "script_fixed.py"):
    """Try to save a copy of this script beside the original."""
    try:
        import pathlib
        src = pathlib.Path(__file__)
        dst = src.with_name(dest_name)
        dst.write_text(src.read_text(encoding="utf-8"), encoding="utf-8")
        print(f"[INFO] Wrote a copy of this script to '{dst}'")
    except Exception as e:
        print(f"[WARNING] Could not write copy: {e}")


def main():
    print("[STEP 1] Fetching the bachelor study programme page (2025-2026)…")
    soup = fetch_html(UA_BA_URL)
    time.sleep(0.3)

    print("[STEP 2] Parsing courses and metadata (this may take a few seconds)…")
    df = parse_courses_from_page(soup)
    if df.empty:
        print("[WARNING] No 2025-xxxx course rows found. The page structure may have changed.")
        print("          Please open the URL in a browser and check if '2025-2026' content is visible.")
    else:
        # Sanity: flag pillar rows (deel 3) visibly
        df["Pillar"] = df["Pillar"].replace({"": None})
        print(f"[INFO] Parsed {len(df)} course rows for 2025-2026.")
        # Optional: give you a quick view in console
        head = df.head(10).to_string(index=False)
        print("[PREVIEW]\n" + head)

    print(f"[STEP 3] Updating Excel: {EXCEL_PATH}")
    archive_and_write(EXCEL_PATH, df, TARGET_SHEET)
    print("\nDone. You can now open the workbook and review the refreshed 'BAGES' sheet.")


if __name__ == "__main__":
    main()
    # Try to save a copy of the corrected script beside the original.
    save_copy()
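
A quick way to run this updater (the filename `update_bages.py` is only illustrative; the dependency list comes from the docstring above):

```sh
pip install requests beautifulsoup4 lxml pandas openpyxl
# run from the folder that contains 'ongeloofelijken tool.xlsx'
python update_bages.py
```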

Binary file not shown.

View File

@ -181,22 +181,49 @@ def check_students_with_mismatching_SP_values(predelib_df: pd.DataFrame) -> List
logger.info("All required columns found in dataframe")
# Check for mismatching SP values
mismatching_students = []
for index, row in predelib_df.iterrows():
if row['Totaal aantal SP'] != row['Aantal SP vereist']:
mismatching_students.append({
'ID': row['ID'],
'Achternaam': row['Achternaam'],
'Voornaam': row['Voornaam'],
'E-mail': row['E-mail'],
'Totaal_aantal_SP': row['Totaal aantal SP'],
'Aantal_SP_vereist': row['Aantal SP vereist'],
'Waarschuwing': row['Waarschuwing'],
'Adviesrapport_code': row['Adviesrapport code']
})
# Use vectorized comparison to find rows where the SP values differ
sp_col = predelib_df['Totaal aantal SP']
req_col = predelib_df['Aantal SP vereist']
logger.info(f"Found {len(mismatching_students)} students with mismatching SP values")
# Simple inequality works for most cases; NaN != NaN will be True which is acceptable
mask = sp_col != req_col
mismatches_df = predelib_df[mask].copy()
logger.info(f"Found {len(mismatches_df)} raw rows with mismatching SP values")
if mismatches_df.empty:
logger.info("No students with mismatching SP values found")
return []
# Keep only unique students by 'ID' (first occurrence).
if 'ID' in mismatches_df.columns:
before_dedup = len(mismatches_df)
mismatches_df = mismatches_df.drop_duplicates(subset=['ID'])
after_dedup = len(mismatches_df)
logger.info(f"Reduced from {before_dedup} rows to {after_dedup} unique students by ID")
else:
logger.warning("Column 'ID' not found - cannot deduplicate by student ID")
# Ensure optional columns exist to avoid KeyError when building dicts
for optional_col in ('Waarschuwing', 'Adviesrapport code'):
if optional_col not in mismatches_df.columns:
mismatches_df[optional_col] = None
# Build the list of mismatching students
mismatching_students = []
for _, row in mismatches_df.iterrows():
mismatching_students.append({
'ID': row.get('ID'),
'Achternaam': row.get('Achternaam'),
'Voornaam': row.get('Voornaam'),
'E-mail': row.get('E-mail'),
'Totaal_aantal_SP': row.get('Totaal aantal SP'),
'Aantal_SP_vereist': row.get('Aantal SP vereist'),
'Waarschuwing': row.get('Waarschuwing'),
'Adviesrapport_code': row.get('Adviesrapport code')
})
logger.info(f"Returning {len(mismatching_students)} unique students with mismatching SP values")
return mismatching_students
except Exception as e:

File diff suppressed because it is too large

View File

@ -1,4 +0,0 @@
Extracurricular courses do not come from the wizard; a separate Excel file is needed for them, namely the enrolment dashboard ('dashboard inschrijvingen').
For the check whether students take extra elective courses (so that the requirement has to be set higher): compare the columns 'Totaal aantal SP' and 'Aantal SP vereist'.
This column must also be checked when students actually take fewer credits, so the two values must always be equal to each other.
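
The rule in these notes is what the updated `check_students_with_mismatching_SP_values` implements; as a standalone illustration, a minimal pandas sketch of the same comparison (the helper name `sp_mismatches` and the toy data are only for illustration):

```python
import pandas as pd

def sp_mismatches(df: pd.DataFrame) -> pd.DataFrame:
    """Return rows where 'Totaal aantal SP' differs from 'Aantal SP vereist'."""
    # The two columns should always be equal; any inequality (more OR fewer credits) is flagged.
    mask = df['Totaal aantal SP'] != df['Aantal SP vereist']
    return df[mask].copy()

# Toy example: the second student takes 6 extra elective credits and is flagged.
students = pd.DataFrame({
    'ID': ['s001', 's002'],
    'Totaal aantal SP': [60, 66],
    'Aantal SP vereist': [60, 60],
})
print(sp_mismatches(students))
```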