Refactor code structure and remove redundant code blocks
326
ongeloofelijken tool/script.py
Normal file
@@ -0,0 +1,326 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Update the 'BAGES' sheet in 'ongeloofelijken tool.xlsx' with the latest (2025-2026) bachelor History
study programme from UAntwerpen. It scrapes the official page and writes a normalized table.

Source page (2025-2026 bachelor study programme):
https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/
- In 2025-2026 the 'Geschiedenis per periode en gebied' structure changed to a two-pillar model:
  * Chronologische pijler: 3 course units (middeleeuwen, nieuwe tijd, nieuwste tijd)
  * Thematische pijler: 2 course units
  (See the faculty helpdesk note with the change summary and transition measures.)

IMPORTANT:
- This script only updates the 'BAGES' (bachelor) sheet, because the provided link covers the bachelor page.
- The 'MAGES' and 'SPVP' sheets remain untouched.

Tested with: requests, beautifulsoup4, lxml, pandas, openpyxl
"""

import re
import sys
import time
import urllib.parse
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException


# ------------------------- Configuration -------------------------
EXCEL_PATH = "ongeloofelijken tool.xlsx"
TARGET_SHEET = "BAGES"
ARCHIVE_PREFIX = "BAGES_OLD_"
UA_BA_URL = "https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/"
TARGET_YEAR_PREFIX = "2025-"  # Anchor/course URLs have '?id=<year>-<code>'; we filter with '2025-'
TIMEOUT = 30
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; uantwerpen-bages-updater/1.0; +https://www.uantwerpen.be/)",
    "Accept-Language": "nl,en;q=0.8"
}
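# Note: the year filter above keys on the 'id' query parameter of course links; a matching
# href carries something like '?id=2025-1002FLWGES' (the example course id used later in
# this file), optionally followed by '&lang=nl'.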
# ------------------------- Helpers -------------------------
def fetch_html(url: str) -> BeautifulSoup:
    resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")


def extract_text(el) -> str:
    return re.sub(r"\s+", " ", " ".join(el.stripped_strings)) if el else ""


def parse_meta_from_block(block_text: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """
    Try to parse Semester, Credits, Lecturers, Notes from a block of text next to a course link.
    Returns (semester, credits, lecturers, notes)
    """
    text = block_text

    # Semester examples: '1E SEM', '2E SEM', '1E/2E SEM'
    sem = None
    m_sem = re.search(r"\b(1E\s*/\s*2E\s*SEM|1E\s*SEM|2E\s*SEM)\b", text, flags=re.I)
    if m_sem:
        sem = m_sem.group(1).upper().replace(" ", "")

    # Credits examples: '6 studiepunten', '3 studiepunten'
    credits = None
    m_sp = re.search(r"(\d+)\s*studiepunten", text, flags=re.I)
    if m_sp:
        credits = m_sp.group(1)

    # Lecturers: after 'Lesgever (s):' or 'Lesgever(s):'
    lecturers = None
    m_lect = re.search(r"Lesgever\s*\(s\)\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if not m_lect:
        m_lect = re.search(r"Lesgever[s]?\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if m_lect:
        lecturers = m_lect.group(1).strip(" .").replace(" ,", ",")

    # Notes: look for two-yearly ('Tweejaarlijks') scheduling etc.
    notes = None
    if re.search(r"Tweejaarlijks", text, flags=re.I):
        # Try to capture the "even/oneven" phrasing
        m_ev = re.search(r"tweejaarlijks[^.]*?(even|oneven)[^.]*jaar", text, flags=re.I)
        notes = "Tweejaarlijks" + (f" ({m_ev.group(1).lower()} jaar)" if m_ev else "")

    return sem, credits, lecturers, notes
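# Illustrative call (the block text and lecturer name are hypothetical, not taken from the live page):
#   parse_meta_from_block("1E SEM 6 studiepunten Lesgever(s): A. Voorbeeld")
#   -> ("1ESEM", "6", "A. Voorbeeld", None)
# Note that the semester marker loses its internal space because of the .replace(" ", "") step.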
def nearest_sections(a_tag) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Find nearest preceding headings to classify the row.
    Returns (section, subsection, pillar)
    - section: e.g., 'Modeltraject deel 1/2/3'
    - subsection: e.g., 'Wijsbegeerte en sociale wetenschappen', 'Keuzeopleidingsonderdelen', 'Inleiding tot de geschiedenis', etc.
    - pillar: for deel 3: 'Chronologische pijler', 'Thematische pijler' or None
    """
    # The site uses a variety of headings (h2, h3, h4, h5); we trace back to find labels
    section = subsection = pillar = None

    # Walk back over previous headings to capture a hierarchy
    prev_heads = []
    cur = a_tag
    for _ in range(40):  # limit the walk so we do not scan the whole page
        cur = cur.find_previous(["h2", "h3", "h4", "h5"])
        if not cur:
            break
        txt = extract_text(cur)
        prev_heads.append(txt)

    # Determine labels from the nearest headings
    for txt in prev_heads:
        t = txt.lower()
        if section is None and "modeltraject deel" in t:
            # e.g. "Modeltraject deel 1"
            section = txt
        if subsection is None:
            # Typical subsections
            if any(k in t for k in [
                "wijsbegeerte en sociale wetenschappen",
                "methodologie van de geschiedenis",
                "historische oefeningen",
                "inleiding tot de geschiedenis",
                "heuristiek",
                "historisch overzicht",
                "keuzeopleidingsonderdelen",
                "sociale wetenschappen",
            ]):
                subsection = txt
        if pillar is None and ("chronologische pijler" in t or "thematische pijler" in t):
            pillar = txt

        if section and (subsection or pillar):
            # Good enough
            break

    return section, subsection, pillar
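# Illustrative classification (hypothetical heading layout): a course link preceded by a
# "Modeltraject deel 3" heading and a "Chronologische pijler" sub-heading, with no known
# subsection heading in between, would yield
#   ("Modeltraject deel 3", None, "Chronologische pijler")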
def parse_courses_from_page(soup: BeautifulSoup) -> pd.DataFrame:
    """
    Parse all course links for the 2025-xxxx academic year, infer metadata from nearby text,
    and return a normalized DataFrame.
    """
    rows = []

    # Capture all anchors that look like course links containing '?id=2025-<CODE>'
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Normalize relative links
        full_url = urllib.parse.urljoin(UA_BA_URL, href)
        # Filter by the 'id=2025-' parameter (2025-2026)
        if "id=" in href:
            q = urllib.parse.urlparse(href).query
            params = urllib.parse.parse_qs(q)
            ids = params.get("id", [])
            if not ids:
                continue
            # Some pages use '2025-XXXXX' or '2025-XXXXX&lang=nl'
            if not any(idv.startswith(TARGET_YEAR_PREFIX) for idv in ids):
                continue
            course_id = ids[0]  # e.g., '2025-1002FLWGES'
        else:
            # No id=... parameter; skip
            continue

        # Extract code after '2025-'
        code = None
        m = re.match(r"2025-([A-Za-z0-9]+)", course_id)
        if m:
            code = m.group(1)

        name = extract_text(a).strip()
        if not name or not code:
            continue

        # Use a reasonably large ancestor block for metadata search
        container = a
        for _ in range(4):
            if container.parent:
                container = container.parent
        block_text = extract_text(container)

        semester, credits, lecturers, notes = parse_meta_from_block(block_text)
        section, subsection, pillar = nearest_sections(a)

        rows.append({
            "Section": section,
            "Subsection": subsection,
            "Pillar": pillar,
            "Course Code": code,
            "Course Name": name,
            "URL": full_url,
            "Semester": semester,
            "Credits": credits,
            "Lecturers": lecturers,
            "Notes": notes
        })

    if not rows:
        # Return an empty frame with the expected columns so callers can safely check .empty
        return pd.DataFrame(columns=[
            "Section", "Subsection", "Pillar", "Course Code", "Course Name",
            "URL", "Semester", "Credits", "Lecturers", "Notes"
        ])

    df = pd.DataFrame(rows).drop_duplicates(subset=["Course Code", "Course Name"])
    # Keep only rows that clearly belong to the bachelor page; cross-links sometimes appear.
    # Heuristic: keep rows whose Section mentions "Modeltraject deel" or that carry a pillar marker.
    mask = (
        df["Section"].fillna("").str.contains(r"Modeltraject deel", case=False) |
        df["Pillar"].fillna("").str.contains(r"Pijler", case=False)
    )
    df = df[mask].copy()

    # Clean up text for consistency (fill missing values first so they do not become the string 'None')
    def clean_col(s):
        return s.str.replace(r"\s+", " ", regex=True).str.strip()

    for col in ["Section", "Subsection", "Pillar", "Course Name", "Lecturers", "Notes"]:
        df[col] = clean_col(df[col].fillna("").astype(str))

    # Ensure missing pillar/subsection are empty strings for consistent sorting
    df["Pillar"] = df["Pillar"].fillna("")
    df["Subsection"] = df["Subsection"].fillna("")

    # Sort for readability: section → pillar → subsection → name
    df.sort_values(
        by=["Section", "Pillar", "Subsection", "Course Name"],
        inplace=True
    )
    df.reset_index(drop=True, inplace=True)
    return df
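# The returned frame always has the columns Section, Subsection, Pillar, Course Code,
# Course Name, URL, Semester, Credits, Lecturers and Notes. Semester and Credits are kept
# as strings exactly as parsed from the page (e.g. "1ESEM", "6").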
def archive_and_write(excel_path: str, df: pd.DataFrame, target_sheet: str):
    """
    - If sheet 'BAGES' exists, rename it to 'BAGES_OLD_YYYYMMDD'
    - Write df to 'BAGES'
    """
    try:
        wb = load_workbook(excel_path)
    except FileNotFoundError:
        print(f"[INFO] File not found, creating new workbook: {excel_path}")
        # Write a new file straight away
        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        return
    except InvalidFileException:
        print(f"[ERROR] Not a valid Excel file: {excel_path}")
        sys.exit(1)

    # Rename existing BAGES to archive
    if target_sheet in wb.sheetnames:
        date_suffix = datetime.now().strftime("%Y%m%d")
        archive_name = ARCHIVE_PREFIX + date_suffix
        # Ensure uniqueness (append a counter if necessary)
        counter = 1
        final_archive = archive_name
        while final_archive in wb.sheetnames:
            counter += 1
            final_archive = f"{archive_name}_{counter}"
        ws = wb[target_sheet]
        ws.title = final_archive
        print(f"[INFO] Archived existing '{target_sheet}' as '{final_archive}'")

    # Save intermediate
    wb.save(excel_path)

    # Now write the new sheet
    with pd.ExcelWriter(excel_path, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
        df.to_excel(writer, sheet_name=target_sheet, index=False)

    print(f"[SUCCESS] Wrote updated '{target_sheet}' sheet to '{excel_path}'")
    # Also save the updated DataFrame to a separate new Excel file for convenience
    try:
        src = Path(excel_path)
        new_name = src.with_name(f"{src.stem}_updated{src.suffix}")
        # Write a fresh workbook containing only the updated sheet
        with pd.ExcelWriter(str(new_name), engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        print(f"[INFO] Also wrote updated data to new file '{new_name}'")
    except Exception as e:
        print(f"[WARNING] Could not write updated copy to new file: {e}")
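# Archive naming example: if the workbook already has a 'BAGES' sheet and the script runs on
# 3 September 2025, that sheet is renamed to 'BAGES_OLD_20250903'; a second run on the same
# day produces 'BAGES_OLD_20250903_2'.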
def main():
    print("[STEP 1] Fetching the bachelor study programme page (2025-2026)…")
    soup = fetch_html(UA_BA_URL)
    time.sleep(0.3)

    print("[STEP 2] Parsing courses and metadata (this may take a few seconds)…")
    df = parse_courses_from_page(soup)
    if df.empty:
        print("[WARNING] No 2025-xxxx course rows found. The page structure may have changed.")
        print("          Please open the URL in a browser and check if '2025-2026' content is visible.")
    else:
        # Sanity: flag pillar rows (deel 3) visibly
        df["Pillar"] = df["Pillar"].replace({"": None})
        print(f"[INFO] Parsed {len(df)} course rows for 2025-2026.")

        # Optional: quick preview in the console
        head = df.head(10).to_string(index=False)
        print("[PREVIEW]\n" + head)

    print(f"[STEP 3] Updating Excel: {EXCEL_PATH}")
    archive_and_write(EXCEL_PATH, df, TARGET_SHEET)

    print("\nDone. You can now open the workbook and review the refreshed 'BAGES' sheet.")


def save_copy(dest_name: str = "script_fixed.py"):
    # Save a copy of this script beside the original.
    try:
        src = Path(__file__)
        dst = src.with_name(dest_name)
        dst.write_text(src.read_text(encoding="utf-8"), encoding="utf-8")
        print(f"[INFO] Wrote a copy of this script to '{dst}'")
    except Exception as e:
        print(f"[WARNING] Could not write copy: {e}")


if __name__ == "__main__":
    main()
    save_copy()
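# Quick manual check after running the script (sketch; assumes the workbook sits in the
# current working directory):
#   import pandas as pd
#   pd.read_excel("ongeloofelijken tool.xlsx", sheet_name="BAGES").head()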
326
ongeloofelijken tool/script_fixed.py
Normal file
@@ -0,0 +1,326 @@
File contents are identical to 'ongeloofelijken tool/script.py' above (the copy written by save_copy()).
BIN
ongeloofelijken tool/updated
Normal file
Binary file not shown.