#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Update the 'BAGES' sheet in 'ongeloofelijken tool.xlsx' with the latest (2025-2026) bachelor History
study programme from UAntwerpen. It scrapes the official page and writes a normalized table.

Source page (2025-2026 bachelor study programme):
https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/

- In 2025-2026 the 'Geschiedenis per periode en gebied' structure changed to a two-pillar model:
  * Chronologische pijler: 3 OOs (middeleeuwen, nieuwe tijd, nieuwste tijd)
  * Thematische pijler: 2 OOs
  (See the faculty helpdesk note with the change summary and transition measures.)

IMPORTANT:
- This script only updates the 'BAGES' (bachelor) sheet, because the provided link covers the bachelor page.
- The 'MAGES' and 'SPVP' sheets remain untouched.

Tested with: requests, beautifulsoup4, lxml, pandas, openpyxl
"""
import re
import sys
import time
import urllib.parse
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException


# ------------------------- Configuration -------------------------
EXCEL_PATH = "ongeloofelijken tool.xlsx"
TARGET_SHEET = "BAGES"
ARCHIVE_PREFIX = "BAGES_OLD_"
UA_BA_URL = "https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/"
TARGET_YEAR_PREFIX = "2025-"  # Anchor/course URLs have '?id=<year>-<code>'; we filter with '2025-'
TIMEOUT = 30
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; uantwerpen-bages-updater/1.0; +https://www.uantwerpen.be/)",
    "Accept-Language": "nl,en;q=0.8",
}
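# For illustration only (shape assumed from the parsing code below, not verified against the live
# markup): a course anchor is expected to look roughly like
#   <a href="...studieprogramma/?id=2025-1002FLWGES">...</a>
# and only hrefs whose 'id' value starts with TARGET_YEAR_PREFIX are kept.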
# ------------------------- Helpers -------------------------
def fetch_html(url: str) -> BeautifulSoup:
    resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")


def extract_text(el) -> str:
    return re.sub(r"\s+", " ", " ".join(el.stripped_strings)) if el else ""


def parse_meta_from_block(block_text: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """
    Try to parse Semester, Credits, Lecturers, Notes from a block of text next to a course link.
    Returns (semester, credits, lecturers, notes).
    """
    text = block_text

    # Semester examples: '1E SEM', '2E SEM', '1E/2E SEM'
    sem = None
    m_sem = re.search(r"\b(1E\s*/\s*2E\s*SEM|1E\s*SEM|2E\s*SEM)\b", text, flags=re.I)
    if m_sem:
        sem = m_sem.group(1).upper().replace(" ", "")

    # Credits examples: '6 studiepunten', '3 studiepunten'
    credits = None
    m_sp = re.search(r"(\d+)\s*studiepunten", text, flags=re.I)
    if m_sp:
        credits = m_sp.group(1)

    # Lecturers: after 'Lesgever (s):' or 'Lesgever(s):'
    lecturers = None
    m_lect = re.search(r"Lesgever\s*\(s\)\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if not m_lect:
        m_lect = re.search(r"Lesgever[s]?\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if m_lect:
        lecturers = m_lect.group(1).strip(" .").replace(" ,", ",")

    # Notes: flag courses that are only taught every other year ('Tweejaarlijks')
    notes = None
    if re.search(r"Tweejaarlijks", text, flags=re.I):
        # Try to capture the "even/oneven jaar" phrasing
        m_ev = re.search(r"tweejaarlijks[^.]*?(even|oneven)[^.]*jaar", text, flags=re.I)
        notes = "Tweejaarlijks" + (f" ({m_ev.group(1).lower()} jaar)" if m_ev else "")

    return sem, credits, lecturers, notes
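# A quick illustration with made-up block text (not taken from the live page):
#   parse_meta_from_block("Inleiding 1e sem 6 studiepunten Lesgever(s): J. Janssens")
# returns ('1ESEM', '6', 'J. Janssens', None).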
def nearest_sections(a_tag) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Find the nearest preceding headings to classify the row.
    Returns (section, subsection, pillar):
    - section: e.g. 'Modeltraject deel 1/2/3'
    - subsection: e.g. 'Wijsbegeerte en sociale wetenschappen', 'Keuzeopleidingsonderdelen',
      'Inleiding tot de geschiedenis', etc.
    - pillar: for deel 3: 'Chronologische pijler', 'Thematische pijler' or None
    """
    # The site uses a variety of headings (h2-h5); walk back through them to find labels
    section = subsection = pillar = None

    # Collect preceding headings to capture the hierarchy
    prev_heads = []
    cur = a_tag
    for _ in range(40):  # limit the walk so pathological pages cannot stall us
        cur = cur.find_previous(["h2", "h3", "h4", "h5"])
        if not cur:
            break
        prev_heads.append(extract_text(cur))

    # Determine labels from the nearest headings
    for txt in prev_heads:
        t = txt.lower()
        if section is None and "modeltraject deel" in t:
            # e.g. 'Modeltraject deel 1'
            section = txt
        if subsection is None:
            # Typical subsections
            if any(k in t for k in [
                "wijsbegeerte en sociale wetenschappen",
                "methodologie van de geschiedenis",
                "historische oefeningen",
                "inleiding tot de geschiedenis",
                "heuristiek",
                "historisch overzicht",
                "keuzeopleidingsonderdelen",
                "sociale wetenschappen",
            ]):
                subsection = txt
        if pillar is None and ("chronologische pijler" in t or "thematische pijler" in t):
            pillar = txt

        if section and (subsection or pillar):
            # Good enough
            break

    return section, subsection, pillar
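# For a feel of the heading walk, with toy markup (not the real page structure):
#   s = BeautifulSoup("<h2>Modeltraject deel 3</h2><h3>Chronologische pijler</h3>"
#                     "<p><a href='?id=2025-XXXX'>Vak</a></p>", "lxml")
#   nearest_sections(s.a)  ->  ('Modeltraject deel 3', None, 'Chronologische pijler')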
def parse_courses_from_page(soup: BeautifulSoup) -> pd.DataFrame:
    """
    Parse all course links for the 2025-xxxx academic year, infer metadata from nearby text,
    and return a normalized DataFrame.
    """
    rows = []

    # Capture all anchors that look like course links containing '?id=2025-<CODE>'
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Normalize relative links
        full_url = urllib.parse.urljoin(UA_BA_URL, href)
        # Filter by the 'id=2025-...' parameter (2025-2026)
        if "id=" in href:
            q = urllib.parse.urlparse(href).query
            params = urllib.parse.parse_qs(q)
            ids = params.get("id", [])
            if not ids:
                continue
            # Some pages use '2025-XXXXX' or '2025-XXXXX&lang=nl'
            if not any(idv.startswith(TARGET_YEAR_PREFIX) for idv in ids):
                continue
            course_id = ids[0]  # e.g. '2025-1002FLWGES'
        else:
            # No 'id=...' parameter; skip
            continue

        # Extract the course code after '2025-'
        code = None
        m = re.match(TARGET_YEAR_PREFIX + r"([A-Za-z0-9]+)", course_id)
        if m:
            code = m.group(1)

        name = extract_text(a).strip()
        if not name or not code:
            continue

        # Use a reasonably large ancestor block for the metadata search
        container = a
        for _ in range(4):
            if container.parent:
                container = container.parent
        block_text = extract_text(container)

        semester, credits, lecturers, notes = parse_meta_from_block(block_text)
        section, subsection, pillar = nearest_sections(a)

        rows.append({
            "Section": section,
            "Subsection": subsection,
            "Pillar": pillar,
            "Course Code": code,
            "Course Name": name,
            "URL": full_url,
            "Semester": semester,
            "Credits": credits,
            "Lecturers": lecturers,
            "Notes": notes,
        })

    if not rows:
        # Return an empty frame with the expected columns so callers can test df.empty safely
        return pd.DataFrame(columns=[
            "Section", "Subsection", "Pillar", "Course Code", "Course Name",
            "URL", "Semester", "Credits", "Lecturers", "Notes",
        ])

    df = pd.DataFrame(rows).drop_duplicates(subset=["Course Code", "Course Name"])

    # Keep only rows that clearly belong to the bachelor page; cross-links sometimes appear.
    # Heuristic: keep rows whose Section mentions 'Modeltraject deel' or that carry a pillar marker.
    mask = (
        df["Section"].fillna("").str.contains(r"Modeltraject deel", case=False)
        | df["Pillar"].fillna("").str.contains(r"Pijler", case=False)
    )
    df = df[mask].copy()

    # Clean up text for consistency; fill missing values with empty strings first so that
    # None/NaN do not become the literal strings 'None'/'nan' after astype(str)
    def clean_col(s):
        return s.str.replace(r"\s+", " ", regex=True).str.strip()

    for col in ["Section", "Subsection", "Pillar", "Course Name", "Lecturers", "Notes"]:
        df[col] = clean_col(df[col].fillna("").astype(str))

    # Sort for readability: section -> pillar -> subsection -> name
    df.sort_values(
        by=["Section", "Pillar", "Subsection", "Course Name"],
        inplace=True,
    )
    df.reset_index(drop=True, inplace=True)
    return df
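# A quick offline check (assuming you saved the programme page locally as 'bachelor.html'):
#   with open("bachelor.html", encoding="utf-8") as fh:
#       df = parse_courses_from_page(BeautifulSoup(fh.read(), "lxml"))
#   print(df[["Section", "Course Code", "Course Name"]].head())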
def archive_and_write(excel_path: str, df: pd.DataFrame, target_sheet: str):
    """
    - If sheet 'BAGES' exists, rename it to 'BAGES_OLD_YYYYMMDD'.
    - Write df to 'BAGES'.
    """
    try:
        wb = load_workbook(excel_path)
    except FileNotFoundError:
        print(f"[INFO] File not found, creating new workbook: {excel_path}")
        # Write a new file straight away
        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        return
    except InvalidFileException:
        print(f"[ERROR] Not a valid Excel file: {excel_path}")
        sys.exit(1)

    # Rename an existing BAGES sheet to an archive name
    if target_sheet in wb.sheetnames:
        date_suffix = datetime.now().strftime("%Y%m%d")
        archive_name = ARCHIVE_PREFIX + date_suffix
        # Ensure uniqueness (append a counter if necessary)
        counter = 1
        final_archive = archive_name
        while final_archive in wb.sheetnames:
            counter += 1
            final_archive = f"{archive_name}_{counter}"
        ws = wb[target_sheet]
        ws.title = final_archive
        print(f"[INFO] Archived existing '{target_sheet}' as '{final_archive}'")

    # Save the intermediate state (old sheet renamed, new one not yet written)
    wb.save(excel_path)

    # Now write the new sheet
    with pd.ExcelWriter(excel_path, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
        df.to_excel(writer, sheet_name=target_sheet, index=False)

    print(f"[SUCCESS] Wrote updated '{target_sheet}' sheet to '{excel_path}'")

    # Also save the updated DataFrame to a separate new Excel file for convenience
    try:
        src = Path(excel_path)
        new_name = src.with_name(f"{src.stem}_updated{src.suffix}")
        # Write a fresh workbook containing only the updated sheet
        with pd.ExcelWriter(str(new_name), engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        print(f"[INFO] Also wrote updated data to new file '{new_name}'")
    except Exception as e:
        print(f"[WARNING] Could not write updated copy to new file: {e}")
def main():
    print("[STEP 1] Fetching the bachelor study programme page (2025-2026)…")
    soup = fetch_html(UA_BA_URL)
    time.sleep(0.3)

    print("[STEP 2] Parsing courses and metadata (this may take a few seconds)…")
    df = parse_courses_from_page(soup)
    if df.empty:
        print("[WARNING] No 2025-xxxx course rows found. The page structure may have changed.")
        print("          Please open the URL in a browser and check if '2025-2026' content is visible.")
        print("          Leaving the workbook untouched.")
        sys.exit(1)

    # Sanity: make pillar rows (deel 3) stand out by storing missing pillars as None
    df["Pillar"] = df["Pillar"].replace({"": None})
    print(f"[INFO] Parsed {len(df)} course rows for 2025-2026.")

    # Optional: quick preview in the console
    print("[PREVIEW]\n" + df.head(10).to_string(index=False))

    print(f"[STEP 3] Updating Excel: {EXCEL_PATH}")
    archive_and_write(EXCEL_PATH, df, TARGET_SHEET)

    print("\nDone. You can now open the workbook and review the refreshed 'BAGES' sheet.")
def save_copy(dest_name: str = "script_fixed.py"):
    """Try to save a copy of this script beside the original."""
    try:
        src = Path(__file__)
        dst = src.with_name(dest_name)
        dst.write_text(src.read_text(encoding="utf-8"), encoding="utf-8")
        print(f"[INFO] Wrote a copy of this script to '{dst}'")
    except Exception as e:
        print(f"[WARNING] Could not write copy: {e}")


if __name__ == "__main__":
    main()
    save_copy()