#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Update 'BAGES' sheet in 'ongeloofelijken tool.xlsx' with the latest (2025-2026) bachelor History
study programme from UAntwerpen. It scrapes the official page and writes a normalized table.
Source page (2025-2026 bachelor study programme):
https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/
- In 2025-2026 the 'Geschiedenis per periode en gebied' structure changed to a two-pillar model:
* Chronologische pijler: 3 OOs (middeleeuwen, nieuwe tijd, nieuwste tijd)
* Thematische pijler: 2 OOs
(See faculty helpdesk note with change summary and transition measures.)
IMPORTANT:
- This script only updates the 'BAGES' (Bachelor) sheet, because the provided link covers the bachelor page.
- 'MAGES' and 'SPVP' sheets remain untouched.
Tested with: requests, beautifulsoup4, lxml, pandas, openpyxl
"""
import re
import sys
import time
import urllib.parse
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple
import requests
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException
# ------------------------- Configuration -------------------------
EXCEL_PATH = "ongeloofelijken tool.xlsx"
TARGET_SHEET = "BAGES"
ARCHIVE_PREFIX = "BAGES_OLD_"
UA_BA_URL = "https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/"
TARGET_YEAR_PREFIX = "2025-" # Anchor/course URLs have '?id=<year>-<code>'; we filter with '2025-'
TIMEOUT = 30
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; uantwerpen-bages-updater/1.0; +https://www.uantwerpen.be/)",
    "Accept-Language": "nl,en;q=0.8",
}
# ------------------------- Helpers -------------------------
def fetch_html(url: str) -> BeautifulSoup:
    """Download a page and return its parsed HTML tree."""
    resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")


def extract_text(el) -> str:
    """Return the element's text with all whitespace collapsed to single spaces."""
    return re.sub(r"\s+", " ", " ".join(el.stripped_strings)) if el else ""
def parse_meta_from_block(block_text: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """
    Try to parse Semester, Credits, Lecturers and Notes from a block of text next to a course link.
    Returns (semester, credits, lecturers, notes).
    """
    text = block_text
    # Semester examples: '1E SEM', '2E SEM', '1E/2E SEM'
    sem = None
    m_sem = re.search(r"\b(1E\s*/\s*2E\s*SEM|1E\s*SEM|2E\s*SEM)\b", text, flags=re.I)
    if m_sem:
        sem = m_sem.group(1).upper().replace(" ", "")
    # Credits examples: '6 studiepunten', '3 studiepunten'
    credits = None
    m_sp = re.search(r"(\d+)\s*studiepunten", text, flags=re.I)
    if m_sp:
        credits = m_sp.group(1)
    # Lecturers: after 'Lesgever (s):' or 'Lesgever(s):'
    lecturers = None
    m_lect = re.search(r"Lesgever\s*\(s\)\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if not m_lect:
        m_lect = re.search(r"Lesgever[s]?\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if m_lect:
        lecturers = m_lect.group(1).strip(" .").replace(" ,", ",")
    # Notes: look for 'Tweejaarlijks' (two-yearly) courses
    notes = None
    if re.search(r"Tweejaarlijks", text, flags=re.I):
        # Try to capture the "even/oneven" (even/odd year) phrasing
        m_ev = re.search(r"tweejaarlijks[^.]*?(even|oneven)[^.]*jaar", text, flags=re.I)
        notes = "Tweejaarlijks" + (f" ({m_ev.group(1).lower()} jaar)" if m_ev else "")
    return sem, credits, lecturers, notes
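# Illustrative example of the metadata parsing above (the input string is an assumption,
# not copied from the live page):
#   parse_meta_from_block("1E SEM 6 studiepunten Lesgever(s): J. Janssens")
#   -> ("1ESEM", "6", "J. Janssens", None)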
def nearest_sections(a_tag) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Find the nearest preceding headings to classify the row.
    Returns (section, subsection, pillar):
    - section: e.g. 'Modeltraject deel 1/2/3'
    - subsection: e.g. 'Wijsbegeerte en sociale wetenschappen', 'Keuzeopleidingsonderdelen',
      'Inleiding tot de geschiedenis', etc.
    - pillar: for deel 3: 'Chronologische pijler', 'Thematische pijler' or None
    """
    # The site uses a variety of headings (h2-h5); walk back through them to find labels
    section = subsection = pillar = None
    prev_heads = []
    cur = a_tag
    for _ in range(40):  # bound the walk so we never scan the whole document
        cur = cur.find_previous(["h2", "h3", "h4", "h5"])
        if not cur:
            break
        prev_heads.append(extract_text(cur))
    # Determine labels from the nearest headings
    for txt in prev_heads:
        t = txt.lower()
        if section is None and "modeltraject deel" in t:
            # Normalized like "Modeltraject deel 1"
            section = txt
        if subsection is None:
            # Typical subsections
            if any(k in t for k in [
                "wijsbegeerte en sociale wetenschappen",
                "methodologie van de geschiedenis",
                "historische oefeningen",
                "inleiding tot de geschiedenis",
                "heuristiek",
                "historisch overzicht",
                "keuzeopleidingsonderdelen",
                "sociale wetenschappen",
            ]):
                subsection = txt
        if pillar is None and ("chronologische pijler" in t or "thematische pijler" in t):
            pillar = txt
        if section and (subsection or pillar):
            # Good enough
            break
    return section, subsection, pillar
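# Illustrative: for a course link sitting under the headings "Modeltraject deel 3" and
# "Chronologische pijler", the call returns roughly
#   ("Modeltraject deel 3", None, "Chronologische pijler")
# (the exact heading texts depend on the live page).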
def parse_courses_from_page(soup: BeautifulSoup) -> pd.DataFrame:
    """
    Parse all course links for the 2025-xxxx academic year, infer metadata from nearby text,
    and return a normalized DataFrame.
    """
    rows = []
    # Capture all anchors that look like course links containing '?id=2025-<CODE>'
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Normalize relative links
        full_url = urllib.parse.urljoin(UA_BA_URL, href)
        # Filter by the 'id=2025-' parameter (2025-2026)
        if "id=" not in href:
            # No id=... parameter; skip
            continue
        q = urllib.parse.urlparse(href).query
        params = urllib.parse.parse_qs(q)
        ids = params.get("id", [])
        if not ids:
            continue
        # Some pages use '2025-XXXXX' or '2025-XXXXX&lang=nl'
        if not any(idv.startswith(TARGET_YEAR_PREFIX) for idv in ids):
            continue
        course_id = ids[0]  # e.g. '2025-1002FLWGES'
        # Extract the code after '2025-'
        code = None
        m = re.match(r"2025-([A-Za-z0-9]+)", course_id)
        if m:
            code = m.group(1)
        name = extract_text(a).strip()
        if not name or not code:
            continue
        # Use a reasonably large ancestor block for the metadata search
        container = a
        for _ in range(4):
            if container.parent:
                container = container.parent
        block_text = extract_text(container)
        semester, credits, lecturers, notes = parse_meta_from_block(block_text)
        section, subsection, pillar = nearest_sections(a)
        rows.append({
            "Section": section,
            "Subsection": subsection,
            "Pillar": pillar,
            "Course Code": code,
            "Course Name": name,
            "URL": full_url,
            "Semester": semester,
            "Credits": credits,
            "Lecturers": lecturers,
            "Notes": notes,
        })
    # Build the frame with explicit columns so an empty result does not break the steps below
    columns = [
        "Section", "Subsection", "Pillar", "Course Code", "Course Name",
        "URL", "Semester", "Credits", "Lecturers", "Notes",
    ]
    df = pd.DataFrame(rows, columns=columns).drop_duplicates(subset=["Course Code", "Course Name"])
    # Keep only rows that clearly belong to the bachelor page; sometimes cross-links appear.
    # Heuristic: keep rows whose Section starts with "Modeltraject deel" or that carry a pillar marker.
    mask = (
        df["Section"].fillna("").str.contains(r"Modeltraject deel", case=False) |
        df["Pillar"].fillna("").str.contains(r"Pijler", case=False)
    )
    df = df[mask].copy()

    # Clean up text for consistency; fill missing values first so NaN never becomes the string 'nan'
    def clean_col(s):
        return s.fillna("").astype(str).str.replace(r"\s+", " ", regex=True).str.strip()

    for col in ["Section", "Subsection", "Pillar", "Course Name", "Lecturers", "Notes"]:
        df[col] = clean_col(df[col])
    # Missing pillar/subsection are now empty strings, which keeps sorting consistent.
    # Sort for readability: section -> pillar -> subsection -> name
    df.sort_values(
        by=["Section", "Pillar", "Subsection", "Course Name"],
        inplace=True,
    )
    df.reset_index(drop=True, inplace=True)
    return df
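# The resulting DataFrame carries the columns Section, Subsection, Pillar, Course Code,
# Course Name, URL, Semester, Credits, Lecturers and Notes, sorted by
# Section -> Pillar -> Subsection -> Course Name; this is the layout written to the
# 'BAGES' sheet by archive_and_write() below.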
def archive_and_write(excel_path: str, df: pd.DataFrame, target_sheet: str):
    """
    - If sheet 'BAGES' exists, rename it to 'BAGES_OLD_YYYYMMDD'.
    - Write df to 'BAGES'.
    """
    try:
        wb = load_workbook(excel_path)
    except FileNotFoundError:
        print(f"[INFO] File not found, creating new workbook: {excel_path}")
        # Write a new file straight away
        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        return
    except InvalidFileException:
        print(f"[ERROR] Not a valid Excel file: {excel_path}")
        sys.exit(1)
    # Rename an existing BAGES sheet to an archive name
    if target_sheet in wb.sheetnames:
        date_suffix = datetime.now().strftime("%Y%m%d")
        archive_name = ARCHIVE_PREFIX + date_suffix
        # Ensure uniqueness (append a counter if necessary)
        counter = 1
        final_archive = archive_name
        while final_archive in wb.sheetnames:
            counter += 1
            final_archive = f"{archive_name}_{counter}"
        ws = wb[target_sheet]
        ws.title = final_archive
        print(f"[INFO] Archived existing '{target_sheet}' as '{final_archive}'")
        # Save the intermediate state so the append below sees the renamed sheet
        wb.save(excel_path)
    # Now write the new sheet
    with pd.ExcelWriter(excel_path, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
        df.to_excel(writer, sheet_name=target_sheet, index=False)
    print(f"[SUCCESS] Wrote updated '{target_sheet}' sheet to '{excel_path}'")
    # Also save the updated DataFrame to a separate new Excel file for convenience
    try:
        src = Path(excel_path)
        new_name = src.with_name(f"{src.stem}_updated{src.suffix}")
        # Write a fresh workbook containing only the updated sheet
        with pd.ExcelWriter(str(new_name), engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        print(f"[INFO] Also wrote updated data to new file '{new_name}'")
    except Exception as e:
        print(f"[WARNING] Could not write updated copy to new file: {e}")
def main():
    print("[STEP 1] Fetching the bachelor study programme page (2025-2026)…")
    soup = fetch_html(UA_BA_URL)
    time.sleep(0.3)
    print("[STEP 2] Parsing courses and metadata (this may take a few seconds)…")
    df = parse_courses_from_page(soup)
    if df.empty:
        print("[WARNING] No 2025-xxxx course rows found. The page structure may have changed.")
        print("          Please open the URL in a browser and check if '2025-2026' content is visible.")
    else:
        # Blank out the Pillar column where no pillar applies, so the deel-3 pillar rows stand out
        df["Pillar"] = df["Pillar"].replace({"": None})
        print(f"[INFO] Parsed {len(df)} course rows for 2025-2026.")
        # Optional: quick preview in the console
        head = df.head(10).to_string(index=False)
        print("[PREVIEW]\n" + head)
    print(f"[STEP 3] Updating Excel: {EXCEL_PATH}")
    archive_and_write(EXCEL_PATH, df, TARGET_SHEET)
    print("\nDone. You can now open the workbook and review the refreshed 'BAGES' sheet.")
def save_copy(dest_name: str = "script_fixed.py"):
    """Try to save a copy of this script beside the original."""
    try:
        src = Path(__file__)
        dst = src.with_name(dest_name)
        dst.write_text(src.read_text(encoding="utf-8"), encoding="utf-8")
        print(f"[INFO] Wrote a copy of this script to '{dst}'")
    except Exception as e:
        print(f"[WARNING] Could not write copy: {e}")


if __name__ == "__main__":
    main()
    save_copy()