#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Update 'BAGES' sheet in 'ongeloofelijken tool.xlsx' with the latest (2025-2026) bachelor History
study programme from UAntwerpen. It scrapes the official page and writes a normalized table.
Source page (2025-2026 bachelor study programme):
https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/
- In 2025-2026 the 'Geschiedenis per periode en gebied' structure changed to a two-pillar model:
* Chronologische pijler: 3 OOs (middeleeuwen, nieuwe tijd, nieuwste tijd)
* Thematische pijler: 2 OOs
(See faculty helpdesk note with change summary and transition measures.)
IMPORTANT:
- This script only updates the 'BAGES' (Bachelor) sheet, because the provided link covers the bachelor page.
- 'MAGES' and 'SPVP' sheets remain untouched.
Tested with: requests, beautifulsoup4, lxml, pandas, openpyxl
"""
import re
import sys
import time
import urllib.parse
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple
import requests
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException
# ------------------------- Configuration -------------------------
EXCEL_PATH = "ongeloofelijken tool.xlsx"
TARGET_SHEET = "BAGES"
ARCHIVE_PREFIX = "BAGES_OLD_"
UA_BA_URL = "https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/"
TARGET_YEAR_PREFIX = "2025-" # Anchor/course URLs have '?id=<year>-<code>'; we filter with '2025-'
TIMEOUT = 30
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; uantwerpen-bages-updater/1.0; +https://www.uantwerpen.be/)",
    "Accept-Language": "nl,en;q=0.8",
}
# ------------------------- Helpers -------------------------
def fetch_html(url: str) -> BeautifulSoup:
    """Download a page and return its parsed HTML tree."""
    resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")


def extract_text(el) -> str:
    """Return the element's text with all whitespace collapsed to single spaces."""
    return re.sub(r"\s+", " ", " ".join(el.stripped_strings)) if el else ""
def parse_meta_from_block(block_text: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """
    Try to parse Semester, Credits, Lecturers and Notes from a block of text next to a course link.
    Returns (semester, credits, lecturers, notes).
    """
    text = block_text
    # Semester examples: '1E SEM', '2E SEM', '1E/2E SEM'
    sem = None
    m_sem = re.search(r"\b(1E\s*/\s*2E\s*SEM|1E\s*SEM|2E\s*SEM)\b", text, flags=re.I)
    if m_sem:
        sem = m_sem.group(1).upper().replace(" ", "")
    # Credits examples: '6 studiepunten', '3 studiepunten'
    credits = None
    m_sp = re.search(r"(\d+)\s*studiepunten", text, flags=re.I)
    if m_sp:
        credits = m_sp.group(1)
    # Lecturers: after 'Lesgever (s):' or 'Lesgever(s):'
    lecturers = None
    m_lect = re.search(r"Lesgever\s*\(s\)\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if not m_lect:
        m_lect = re.search(r"Lesgever[s]?\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if m_lect:
        lecturers = m_lect.group(1).strip(" .").replace(" ,", ",")
    # Notes: look for 'Tweejaarlijks' (two-yearly) courses
    notes = None
    if re.search(r"Tweejaarlijks", text, flags=re.I):
        # Try to capture the "even/oneven" (even/odd year) phrasing
        m_ev = re.search(r"tweejaarlijks[^.]*?(even|oneven)[^.]*jaar", text, flags=re.I)
        notes = "Tweejaarlijks" + (f" ({m_ev.group(1).lower()} jaar)" if m_ev else "")
    return sem, credits, lecturers, notes
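# Illustrative example of the metadata parsing above (the input string is an assumption,
# not copied from the live page):
#   parse_meta_from_block("1E SEM 6 studiepunten Lesgever(s): J. Janssens")
#   -> ("1ESEM", "6", "J. Janssens", None)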
def nearest_sections(a_tag) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Find the nearest preceding headings to classify the row.
    Returns (section, subsection, pillar):
    - section: e.g. 'Modeltraject deel 1/2/3'
    - subsection: e.g. 'Wijsbegeerte en sociale wetenschappen', 'Keuzeopleidingsonderdelen',
      'Inleiding tot de geschiedenis', etc.
    - pillar: for deel 3: 'Chronologische pijler', 'Thematische pijler' or None
    """
    # The site uses a variety of headings (h2-h5); walk back through them to find labels
    section = subsection = pillar = None
    prev_heads = []
    cur = a_tag
    for _ in range(40):  # bound the walk so we never scan the whole document
        cur = cur.find_previous(["h2", "h3", "h4", "h5"])
        if not cur:
            break
        prev_heads.append(extract_text(cur))
    # Determine labels from the nearest headings
    for txt in prev_heads:
        t = txt.lower()
        if section is None and "modeltraject deel" in t:
            # Normalized like "Modeltraject deel 1"
            section = txt
        if subsection is None:
            # Typical subsections
            if any(k in t for k in [
                "wijsbegeerte en sociale wetenschappen",
                "methodologie van de geschiedenis",
                "historische oefeningen",
                "inleiding tot de geschiedenis",
                "heuristiek",
                "historisch overzicht",
                "keuzeopleidingsonderdelen",
                "sociale wetenschappen",
            ]):
                subsection = txt
        if pillar is None and ("chronologische pijler" in t or "thematische pijler" in t):
            pillar = txt
        if section and (subsection or pillar):
            # Good enough
            break
    return section, subsection, pillar
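# Illustrative: for a course link sitting under the headings "Modeltraject deel 3" and
# "Chronologische pijler", the call returns roughly
#   ("Modeltraject deel 3", None, "Chronologische pijler")
# (the exact heading texts depend on the live page).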
def parse_courses_from_page(soup: BeautifulSoup) -> pd.DataFrame:
    """
    Parse all course links for the 2025-xxxx academic year, infer metadata from nearby text,
    and return a normalized DataFrame.
    """
    rows = []
    # Capture all anchors that look like course links containing '?id=2025-<CODE>'
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Normalize relative links
        full_url = urllib.parse.urljoin(UA_BA_URL, href)
        # Filter by the 'id=2025-' parameter (2025-2026)
        if "id=" not in href:
            # No id=... parameter; skip
            continue
        q = urllib.parse.urlparse(href).query
        params = urllib.parse.parse_qs(q)
        ids = params.get("id", [])
        if not ids:
            continue
        # Some pages use '2025-XXXXX' or '2025-XXXXX&lang=nl'
        if not any(idv.startswith(TARGET_YEAR_PREFIX) for idv in ids):
            continue
        course_id = ids[0]  # e.g. '2025-1002FLWGES'
        # Extract the code after '2025-'
        code = None
        m = re.match(r"2025-([A-Za-z0-9]+)", course_id)
        if m:
            code = m.group(1)
        name = extract_text(a).strip()
        if not name or not code:
            continue
        # Use a reasonably large ancestor block for the metadata search
        container = a
        for _ in range(4):
            if container.parent:
                container = container.parent
        block_text = extract_text(container)
        semester, credits, lecturers, notes = parse_meta_from_block(block_text)
        section, subsection, pillar = nearest_sections(a)
        rows.append({
            "Section": section,
            "Subsection": subsection,
            "Pillar": pillar,
            "Course Code": code,
            "Course Name": name,
            "URL": full_url,
            "Semester": semester,
            "Credits": credits,
            "Lecturers": lecturers,
            "Notes": notes,
        })
    # Build the frame with explicit columns so an empty result does not break the steps below
    columns = [
        "Section", "Subsection", "Pillar", "Course Code", "Course Name",
        "URL", "Semester", "Credits", "Lecturers", "Notes",
    ]
    df = pd.DataFrame(rows, columns=columns).drop_duplicates(subset=["Course Code", "Course Name"])
    # Keep only rows that clearly belong to the bachelor page; sometimes cross-links appear.
    # Heuristic: keep rows whose Section starts with "Modeltraject deel" or that carry a pillar marker.
    mask = (
        df["Section"].fillna("").str.contains(r"Modeltraject deel", case=False) |
        df["Pillar"].fillna("").str.contains(r"Pijler", case=False)
    )
    df = df[mask].copy()

    # Clean up text for consistency; fill missing values first so NaN never becomes the string 'nan'
    def clean_col(s):
        return s.fillna("").astype(str).str.replace(r"\s+", " ", regex=True).str.strip()

    for col in ["Section", "Subsection", "Pillar", "Course Name", "Lecturers", "Notes"]:
        df[col] = clean_col(df[col])
    # Missing pillar/subsection are now empty strings, which keeps sorting consistent.
    # Sort for readability: section -> pillar -> subsection -> name
    df.sort_values(
        by=["Section", "Pillar", "Subsection", "Course Name"],
        inplace=True,
    )
    df.reset_index(drop=True, inplace=True)
    return df
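# The resulting DataFrame carries the columns Section, Subsection, Pillar, Course Code,
# Course Name, URL, Semester, Credits, Lecturers and Notes, sorted by
# Section -> Pillar -> Subsection -> Course Name; this is the layout written to the
# 'BAGES' sheet by archive_and_write() below.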
def archive_and_write(excel_path: str, df: pd.DataFrame, target_sheet: str):
    """
    - If sheet 'BAGES' exists, rename it to 'BAGES_OLD_YYYYMMDD'.
    - Write df to 'BAGES'.
    """
    try:
        wb = load_workbook(excel_path)
    except FileNotFoundError:
        print(f"[INFO] File not found, creating new workbook: {excel_path}")
        # Write a new file straight away
        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        return
    except InvalidFileException:
        print(f"[ERROR] Not a valid Excel file: {excel_path}")
        sys.exit(1)
    # Rename an existing BAGES sheet to an archive name
    if target_sheet in wb.sheetnames:
        date_suffix = datetime.now().strftime("%Y%m%d")
        archive_name = ARCHIVE_PREFIX + date_suffix
        # Ensure uniqueness (append a counter if necessary)
        counter = 1
        final_archive = archive_name
        while final_archive in wb.sheetnames:
            counter += 1
            final_archive = f"{archive_name}_{counter}"
        ws = wb[target_sheet]
        ws.title = final_archive
        print(f"[INFO] Archived existing '{target_sheet}' as '{final_archive}'")
        # Save the intermediate state so the append below sees the renamed sheet
        wb.save(excel_path)
    # Now write the new sheet
    with pd.ExcelWriter(excel_path, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
        df.to_excel(writer, sheet_name=target_sheet, index=False)
    print(f"[SUCCESS] Wrote updated '{target_sheet}' sheet to '{excel_path}'")
    # Also save the updated DataFrame to a separate new Excel file for convenience
    try:
        src = Path(excel_path)
        new_name = src.with_name(f"{src.stem}_updated{src.suffix}")
        # Write a fresh workbook containing only the updated sheet
        with pd.ExcelWriter(str(new_name), engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        print(f"[INFO] Also wrote updated data to new file '{new_name}'")
    except Exception as e:
        print(f"[WARNING] Could not write updated copy to new file: {e}")
def main():
    print("[STEP 1] Fetching the bachelor study programme page (2025-2026)…")
    soup = fetch_html(UA_BA_URL)
    time.sleep(0.3)
    print("[STEP 2] Parsing courses and metadata (this may take a few seconds)…")
    df = parse_courses_from_page(soup)
    if df.empty:
        print("[WARNING] No 2025-xxxx course rows found. The page structure may have changed.")
        print("          Please open the URL in a browser and check if '2025-2026' content is visible.")
    else:
        # Blank out the Pillar column where no pillar applies, so the deel-3 pillar rows stand out
        df["Pillar"] = df["Pillar"].replace({"": None})
        print(f"[INFO] Parsed {len(df)} course rows for 2025-2026.")
        # Optional: quick preview in the console
        head = df.head(10).to_string(index=False)
        print("[PREVIEW]\n" + head)
    print(f"[STEP 3] Updating Excel: {EXCEL_PATH}")
    archive_and_write(EXCEL_PATH, df, TARGET_SHEET)
    print("\nDone. You can now open the workbook and review the refreshed 'BAGES' sheet.")
def save_copy(dest_name: str = "script_fixed.py"):
    """Try to save a copy of this script beside the original."""
    try:
        src = Path(__file__)
        dst = src.with_name(dest_name)
        dst.write_text(src.read_text(encoding="utf-8"), encoding="utf-8")
        print(f"[INFO] Wrote a copy of this script to '{dst}'")
    except Exception as e:
        print(f"[WARNING] Could not write copy: {e}")


if __name__ == "__main__":
    main()
    save_copy()