#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Update the 'BAGES' sheet in 'ongeloofelijken tool.xlsx' with the latest
(2025-2026) bachelor History study programme from UAntwerpen. The script
scrapes the official page and writes a normalized table.

Source page (2025-2026 bachelor study programme):
https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/

- In 2025-2026 the 'Geschiedenis per periode en gebied' structure changed to
  a two-pillar model:
  * Chronologische pijler: 3 OOs (middeleeuwen, nieuwe tijd, nieuwste tijd)
  * Thematische pijler: 2 OOs
  (See the faculty helpdesk note for the change summary and transition measures.)

IMPORTANT:
- This script only updates the 'BAGES' (bachelor) sheet, because the provided
  link covers the bachelor page.
- The 'MAGES' and 'SPVP' sheets remain untouched.

Tested with: requests, beautifulsoup4, lxml, pandas, openpyxl
"""

import re
import sys
import time
import urllib.parse
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException

# ------------------------- Configuration -------------------------

EXCEL_PATH = "ongeloofelijken tool.xlsx"
TARGET_SHEET = "BAGES"
ARCHIVE_PREFIX = "BAGES_OLD_"

UA_BA_URL = (
    "https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/"
    "geschiedenis-studeren/bachelor/studieprogramma/"
)
TARGET_YEAR_PREFIX = "2025-"  # Course URLs carry '?id=2025-...'; we filter on this prefix

TIMEOUT = 30
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; uantwerpen-bages-updater/1.0; +https://www.uantwerpen.be/)",
    "Accept-Language": "nl,en;q=0.8",
}

# ------------------------- Helpers -------------------------

def fetch_html(url: str) -> BeautifulSoup:
    resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")


def extract_text(el) -> str:
    # Collapse all runs of whitespace to single spaces.
    return re.sub(r"\s+", " ", " ".join(el.stripped_strings)) if el else ""


def parse_meta_from_block(block_text: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """
    Try to parse Semester, Credits, Lecturers and Notes from a block of text
    next to a course link. Returns (semester, credits, lecturers, notes).
    """
    text = block_text

    # Semester examples: '1E SEM', '2E SEM', '1E/2E SEM'
    sem = None
    m_sem = re.search(r"\b(1E\s*/\s*2E\s*SEM|1E\s*SEM|2E\s*SEM)\b", text, flags=re.I)
    if m_sem:
        sem = m_sem.group(1).upper().replace(" ", "")

    # Credits examples: '6 studiepunten', '3 studiepunten'
    credits = None
    m_sp = re.search(r"(\d+)\s*studiepunten", text, flags=re.I)
    if m_sp:
        credits = m_sp.group(1)

    # Lecturers: after 'Lesgever (s):' or 'Lesgever(s):'.
    # Note: extract_text() collapses whitespace, so the '\s{2,}' boundary only
    # fires on raw (uncollapsed) text; on collapsed text the capture runs to
    # the end of the block.
    lecturers = None
    m_lect = re.search(r"Lesgever\s*\(s\)\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if not m_lect:
        m_lect = re.search(r"Lesgever[s]?\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if m_lect:
        lecturers = m_lect.group(1).strip(" .").replace(" ,", ",")

    # Notes: flag two-yearly courses ('Tweejaarlijks'), with even/odd year if stated.
    notes = None
    if re.search(r"Tweejaarlijks", text, flags=re.I):
        m_ev = re.search(r"tweejaarlijks[^.]*?(even|oneven)[^.]*jaar", text, flags=re.I)
        notes = "Tweejaarlijks" + (f" ({m_ev.group(1).lower()} jaar)" if m_ev else "")

    return sem, credits, lecturers, notes
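
# A minimal, self-contained illustration of parse_meta_from_block (not called
# by the pipeline). The sample string and lecturer name below are invented;
# real page blocks look similar but are not guaranteed to match this shape.
def _demo_parse_meta() -> None:
    sample = "Middeleeuwen 1E SEM 6 studiepunten Lesgever(s): J. Janssens"
    print(parse_meta_from_block(sample))
    # -> ('1ESEM', '6', 'J. Janssens', None)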

def nearest_sections(a_tag) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Find the nearest preceding headings to classify the row.
    Returns (section, subsection, pillar):
    - section: e.g. 'Modeltraject deel 1/2/3'
    - subsection: e.g. 'Wijsbegeerte en sociale wetenschappen',
      'Keuzeopleidingsonderdelen', 'Inleiding tot de geschiedenis', etc.
    - pillar: for deel 3: 'Chronologische pijler', 'Thematische pijler' or None
    """
    # The site uses a variety of headings (h2-h5); walk backwards through
    # the preceding headings to reconstruct the hierarchy above this link.
    section = subsection = pillar = None

    prev_heads = []
    cur = a_tag
    for _ in range(40):  # limit the walk to avoid scanning the whole document
        cur = cur.find_previous(["h2", "h3", "h4", "h5"])
        if not cur:
            break
        prev_heads.append(extract_text(cur))

    # Determine labels from the collected headings (nearest first).
    for txt in prev_heads:
        t = txt.lower()
        if section is None and "modeltraject deel" in t:
            # Normalized form, e.g. "Modeltraject deel 1"
            section = txt
        if subsection is None:
            # Typical subsections on the bachelor page
            if any(k in t for k in [
                "wijsbegeerte en sociale wetenschappen",
                "methodologie van de geschiedenis",
                "historische oefeningen",
                "inleiding tot de geschiedenis",
                "heuristiek",
                "historisch overzicht",
                "keuzeopleidingsonderdelen",
                "sociale wetenschappen",
            ]):
                subsection = txt
        if pillar is None and ("chronologische pijler" in t or "thematische pijler" in t):
            pillar = txt
        if section and (subsection or pillar):
            # Good enough; stop early
            break

    return section, subsection, pillar
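
# Illustrative sketch of the heading walk on synthetic HTML (not called by the
# pipeline). The markup below is invented; the live page nests headings
# differently, but the classification logic is the same.
def _demo_nearest_sections() -> None:
    html = (
        "<h2>Modeltraject deel 3</h2>"
        "<h4>Chronologische pijler</h4>"
        "<p><a href='?id=2025-1000FLWGES'>Middeleeuwen</a></p>"
    )
    a = BeautifulSoup(html, "lxml").find("a")
    print(nearest_sections(a))
    # -> ('Modeltraject deel 3', None, 'Chronologische pijler')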

def parse_courses_from_page(soup: BeautifulSoup) -> pd.DataFrame:
    """
    Parse all course links for the 2025-xxxx academic year, infer metadata
    from nearby text, and return a normalized DataFrame.
    """
    columns = [
        "Section", "Subsection", "Pillar", "Course Code", "Course Name",
        "URL", "Semester", "Credits", "Lecturers", "Notes",
    ]
    rows = []

    # Capture all anchors that look like course links containing '?id=2025-'
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Normalize relative links
        full_url = urllib.parse.urljoin(UA_BA_URL, href)

        # Filter on the 'id=2025-...' parameter (2025-2026)
        if "id=" in href:
            q = urllib.parse.urlparse(href).query
            params = urllib.parse.parse_qs(q)
            ids = params.get("id", [])
            if not ids:
                continue
            # Some pages use '2025-XXXXX' or '2025-XXXXX&lang=nl'; take the
            # id that actually matches the target year, not blindly the first.
            matching = [idv for idv in ids if idv.startswith(TARGET_YEAR_PREFIX)]
            if not matching:
                continue
            course_id = matching[0]  # e.g. '2025-1002FLWGES'
        else:
            # No id=... parameter; skip
            continue

        # Extract the code after '2025-'
        code = None
        m = re.match(r"2025-([A-Za-z0-9]+)", course_id)
        if m:
            code = m.group(1)

        name = extract_text(a).strip()
        if not name or not code:
            continue

        # Use a reasonably large ancestor block for the metadata search;
        # four levels up usually covers the course's table row or card.
        container = a
        for _ in range(4):
            if container.parent:
                container = container.parent
        block_text = extract_text(container)

        semester, credits, lecturers, notes = parse_meta_from_block(block_text)
        section, subsection, pillar = nearest_sections(a)

        rows.append({
            "Section": section,
            "Subsection": subsection,
            "Pillar": pillar,
            "Course Code": code,
            "Course Name": name,
            "URL": full_url,
            "Semester": semester,
            "Credits": credits,
            "Lecturers": lecturers,
            "Notes": notes,
        })

    if not rows:
        # Return an empty frame with the expected columns so callers can
        # safely test df.empty without hitting a KeyError below.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows).drop_duplicates(subset=["Course Code", "Course Name"])

    # Keep only rows that clearly belong to the bachelor page; cross-links
    # sometimes appear. Heuristic: keep rows whose Section starts with
    # "Modeltraject deel" or that carry a pillar marker.
    mask = (
        df["Section"].fillna("").str.contains(r"Modeltraject deel", case=False)
        | df["Pillar"].fillna("").str.contains(r"Pijler", case=False)
    )
    df = df[mask].copy()

    # Clean up text for consistency. Fill missing values first: astype(str)
    # would otherwise turn None/NaN into the literal string 'None'.
    def clean_col(s):
        return s.str.replace(r"\s+", " ", regex=True).str.strip()

    for col in ["Section", "Subsection", "Pillar", "Course Name", "Lecturers", "Notes"]:
        df[col] = clean_col(df[col].fillna("").astype(str))

    # Sort for readability: section → pillar → subsection → name
    df.sort_values(by=["Section", "Pillar", "Subsection", "Course Name"], inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df


def archive_and_write(excel_path: str, df: pd.DataFrame, target_sheet: str):
    """
    - If sheet 'BAGES' exists, rename it to 'BAGES_OLD_YYYYMMDD'.
    - Write df to a fresh 'BAGES' sheet.
    """
    try:
        wb = load_workbook(excel_path)
    except FileNotFoundError:
        print(f"[INFO] File not found, creating new workbook: {excel_path}")
        # Write a new file straight away
        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        return
    except InvalidFileException:
        print(f"[ERROR] Not a valid Excel file: {excel_path}")
        sys.exit(1)

    # Rename an existing BAGES sheet to an archive name
    if target_sheet in wb.sheetnames:
        date_suffix = datetime.now().strftime("%Y%m%d")
        archive_name = ARCHIVE_PREFIX + date_suffix
        # Ensure uniqueness (append a counter if necessary)
        counter = 1
        final_archive = archive_name
        while final_archive in wb.sheetnames:
            counter += 1
            final_archive = f"{archive_name}_{counter}"
        ws = wb[target_sheet]
        ws.title = final_archive
        print(f"[INFO] Archived existing '{target_sheet}' as '{final_archive}'")

    # Save the rename before appending the new sheet
    wb.save(excel_path)

    # Write the new sheet. 'replace' (rather than 'overlay') guarantees no
    # stale cells survive if a sheet with the target name still exists.
    with pd.ExcelWriter(excel_path, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
        df.to_excel(writer, sheet_name=target_sheet, index=False)
    print(f"[SUCCESS] Wrote updated '{target_sheet}' sheet to '{excel_path}'")

    # Also save the updated DataFrame to a separate new Excel file for convenience
    try:
        src = Path(excel_path)
        new_name = src.with_name(f"{src.stem}_updated{src.suffix}")
        # Write a fresh workbook containing only the updated sheet
        with pd.ExcelWriter(str(new_name), engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        print(f"[INFO] Also wrote updated data to new file '{new_name}'")
    except Exception as e:
        print(f"[WARNING] Could not write updated copy to new file: {e}")
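
# Optional sanity check (a sketch, not wired into main): read the sheet back
# with pandas and confirm the expected columns survived the round trip. The
# column names below mirror those produced by parse_courses_from_page.
def _verify_written_sheet(path: str = EXCEL_PATH, sheet: str = TARGET_SHEET) -> bool:
    try:
        out = pd.read_excel(path, sheet_name=sheet, engine="openpyxl")
    except Exception as e:
        print(f"[WARNING] Verification read failed: {e}")
        return False
    expected = {"Course Code", "Course Name", "Semester", "Credits"}
    ok = expected.issubset(out.columns)
    print(f"[INFO] '{sheet}': {len(out)} rows; required columns present: {ok}")
    return ok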

def main():
    print("[STEP 1] Fetching the bachelor study programme page (2025-2026)…")
    soup = fetch_html(UA_BA_URL)
    time.sleep(0.3)

    print("[STEP 2] Parsing courses and metadata (this may take a few seconds)…")
    df = parse_courses_from_page(soup)

    if df.empty:
        print("[WARNING] No 2025-xxxx course rows found. The page structure may have changed.")
        print("          Please open the URL in a browser and check whether the 2025-2026 content is visible.")
    else:
        # Store missing pillars as None so they appear as blank cells in Excel
        df["Pillar"] = df["Pillar"].replace({"": None})
        print(f"[INFO] Parsed {len(df)} course rows for 2025-2026.")
        # Quick preview in the console
        print("[PREVIEW]\n" + df.head(10).to_string(index=False))

    print(f"[STEP 3] Updating Excel: {EXCEL_PATH}")
    archive_and_write(EXCEL_PATH, df, TARGET_SHEET)
    print("\nDone. You can now open the workbook and review the refreshed 'BAGES' sheet.")


def save_copy(dest_name: str = "script_fixed.py") -> None:
    """Save a copy of this script beside the original."""
    try:
        src = Path(__file__)
        dst = src.with_name(dest_name)
        dst.write_text(src.read_text(encoding="utf-8"), encoding="utf-8")
        print(f"[INFO] Wrote a copy of this script to '{dst}'")
    except Exception as e:
        print(f"[WARNING] Could not write copy: {e}")


if __name__ == "__main__":
    main()
    # Guarded here so importing this module never triggers a file write
    save_copy()
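
# Optional hardening (a sketch, deliberately kept below the entry point and
# not used by main): a requests.Session with automatic retries that could be
# swapped into fetch_html if the site rate-limits or responds flakily.
# Retry comes from urllib3, which ships as a dependency of requests.
def make_retrying_session(total: int = 3, backoff: float = 0.5) -> requests.Session:
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    retry = Retry(
        total=total,
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session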