Refactor code structure and remove redundant code blocks
This commit is contained in:
parent
bcf8f3acae
commit
468df81386
104
check sociologie inleiding soc/script.py
Normal file
@@ -0,0 +1,104 @@
import os
from pathlib import Path
import pandas as pd


def find_duplicates(base_file: Path, ps_files: list[Path], id_col_candidates=None, grade_col_candidates=None):
    """Read the base registration file and several ps files, then find IDs that appear in both.

    Returns a DataFrame with columns: ID, Cijfer, SourceFile
    """
    if id_col_candidates is None:
        id_col_candidates = ["ID", "Id", "id", "inschrijving_id"]
    if grade_col_candidates is None:
        grade_col_candidates = ["Cijfer", "cijfer", "Grade", "grade"]

    # Read base IDs
    print(f"Reading base file: {base_file}")
    base_df = pd.read_excel(base_file)

    # find ID column in base
    base_id_col = next((c for c in base_df.columns if c in id_col_candidates), None)
    if base_id_col is None:
        raise ValueError(f"Could not find an ID column in {base_file}. Tried: {id_col_candidates}")

    base_ids = set(base_df[base_id_col].dropna().astype(str).str.strip())
    print(f"Found {len(base_ids)} IDs in base file (column '{base_id_col}').")

    duplicates = []

    for pf in ps_files:
        print(f"Processing ps file: {pf}")
        try:
            df = pd.read_excel(pf)
        except Exception as e:
            print(f" Skipping {pf} - failed to read: {e}")
            continue

        # guess ID column
        id_col = next((c for c in df.columns if c in id_col_candidates), None)
        if id_col is None:
            # try fuzzy: column name contains 'id'
            id_col = next((c for c in df.columns if 'id' in str(c).lower()), None)
        if id_col is None:
            print(f" No ID column found in {pf}; skipping.")
            continue

        grade_col = next((c for c in df.columns if c in grade_col_candidates), None)
        if grade_col is None:
            # try fuzzy: column name contains 'cij' or 'grade'
            grade_col = next((c for c in df.columns if any(k in str(c).lower() for k in ['cij', 'grade'])), None)

        # normalize IDs to string; keep the full index so the boolean mask aligns with df
        ids_norm = df[id_col].astype(str).str.strip()

        # find the intersection with the base IDs
        mask = df[id_col].notna() & ids_norm.isin(base_ids)
        matched = df.loc[mask]
        if matched.empty:
            print(f" No duplicates found in {pf}.")
            continue

        # collect results
        for _, row in matched.iterrows():
            id_val = str(row[id_col]).strip()
            grade_val = row[grade_col] if (grade_col in row and pd.notna(row[grade_col])) else None
            duplicates.append({"ID": id_val, "Cijfer": grade_val, "SourceFile": pf.name})

        print(f" Found {len(matched)} duplicates in {pf}.")

    dup_df = pd.DataFrame(duplicates)
    return dup_df


def main():
    base = Path(__file__).parent / "inschrijvingslijst sociologie.xlsx"
    # match files like: ps (82).xls.xlsx
    ps_files = sorted(Path(__file__).parent.glob('ps *.xls.xlsx'))

    if not base.exists():
        print(f"Base file not found: {base}")
        return

    if not ps_files:
        print("No ps files found matching pattern 'ps *.xls.xlsx'")
        return

    dup_df = find_duplicates(base, ps_files)

    if dup_df.empty:
        print("No duplicates found across provided files.")
    else:
        # print duplicates
        print("Duplicates found (ID - Cijfer - SourceFile):")
        for _, r in dup_df.iterrows():
            print(f"{r['ID']} - {r['Cijfer']} - {r['SourceFile']}")

        out_csv = Path(__file__).parent / 'duplicates_summary.csv'
        dup_df.to_csv(out_csv, index=False)
        print(f"Wrote summary to {out_csv}")


if __name__ == '__main__':
    main()
@@ -0,0 +1,4 @@
/Mentoraat_2024-2025.xlsx
/reinoud.xlsx
/sisa.xlsx
*.xlsx
@@ -0,0 +1,90 @@
# Script Documentation

## Overview

This script processes two Excel files (`reinoud.xlsx` and `sisa.xlsx`) to find and append missing IDs from `sisa.xlsx` to `reinoud.xlsx`. It also checks for duplicate IDs in `reinoud.xlsx`.

## Functions

### `load_excel(file_path: str, sheet_name: Optional[str] = None) -> pd.DataFrame`

Loads an Excel file into a DataFrame.

### `check_duplicates(df: pd.DataFrame, column: str) -> List[str]`

Checks for duplicate values in a specified column.

### `find_missing_ids(df1: pd.DataFrame, df2: pd.DataFrame, column: str) -> List[str]`

Finds IDs in `df2` that are not in `df1`.

### `append_missing_ids(reinoud_df: pd.DataFrame, sisa_df: pd.DataFrame, column: str, reinoud_file: str) -> pd.DataFrame`

Appends missing IDs and corresponding details from `sisa_df` to `reinoud_df`.

### `main(reinoud_file: str, sisa_file: str, column: str, reinoud_sheet: Optional[str] = None, sisa_sheet: Optional[str] = None)`

Main function to load the Excel files, check for duplicates, append missing IDs, and save the updated DataFrame back to the Excel file.

## Usage

Run the script with the following command:

```sh
python script.py
```

Example usage within the script:

```python
if __name__ == "__main__":
    main('reinoud.xlsx', 'sisa.xlsx', 'Rolnummer', reinoud_sheet='Actief', sisa_sheet='sheet1')
```

## Logging

The script uses the `logging` module to log information and errors. The log level is set to `INFO`.
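For reference, a minimal sketch of that setup (mirroring the `logging.basicConfig` call in `script.py`; switching the level to `DEBUG` is an optional tweak, not something the script itself does):

```python
import logging

# Same format string as script.py; change INFO to DEBUG for more verbose output.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logging.info("example message")  # prints: INFO: example message
```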
## File Structure

```
.gitignore
reinoud.xlsx
script.py
sisa.xlsx
```

## Dependencies

- pandas
- logging

Install dependencies using:

```sh
pip install pandas
```

## License

This script is provided "as-is" without any warranty. Use at your own risk.
@@ -0,0 +1,82 @@
import pandas as pd
import logging
from typing import List, Optional

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

def load_excel(file_path: str, sheet_name: Optional[str] = None) -> pd.DataFrame:
    """Load an Excel file into a DataFrame."""
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        if isinstance(df, dict):
            raise ValueError(f"Multiple sheets found in {file_path}. Please specify a sheet name.")
        return df
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        raise
    except Exception as e:
        logging.error(f"Error loading file {file_path}: {e}")
        raise

def check_duplicates(df: pd.DataFrame, column: str) -> List[str]:
    """Check for duplicate values in a specified column."""
    duplicates = df[column].astype(str)[df[column].duplicated()]
    return duplicates.tolist()

def find_missing_ids(df1: pd.DataFrame, df2: pd.DataFrame, column: str) -> List[str]:
    """Find IDs in df2 that are not in df1."""
    ids1 = df1[column].astype(str)
    ids2 = df2[column].astype(str)
    missing_ids = ids2[~ids2.isin(ids1)]
    return missing_ids.tolist()

def append_missing_ids(reinoud_df: pd.DataFrame, sisa_df: pd.DataFrame, column: str, reinoud_file: str) -> pd.DataFrame:
    """Append missing IDs and corresponding Naam, Voornaam, Plan, and Campus emailadres to reinoud_df."""
    missing_ids = find_missing_ids(reinoud_df, sisa_df, column)
    if missing_ids:
        missing_rows = sisa_df[sisa_df[column].astype(str).isin(missing_ids)]
        # Select only the specified columns
        selected_columns = ['Rolnummer', 'Naam', 'Voornaam', 'Plan', 'Campus emailadres']
        missing_rows = missing_rows[selected_columns]

        # Rename 'Campus emailadres' to 'mail' for reinoud_df
        missing_rows = missing_rows.rename(columns={'Campus emailadres': 'mail'})

        # Append missing rows to reinoud_df
        reinoud_df = pd.concat([reinoud_df, missing_rows], ignore_index=True)

        logging.info(f"Appended missing IDs to {reinoud_file}:")
        for _, row in missing_rows.iterrows():
            logging.info(f"ID: {row[column]}, Naam: {row['Naam']}, Voornaam: {row['Voornaam']}, Plan: {row['Plan']}, mail: {row['mail']}")
    else:
        logging.info("No missing IDs to append.")
    return reinoud_df

def main(reinoud_file: str, sisa_file: str, column: str, reinoud_sheet: Optional[str] = None, sisa_sheet: Optional[str] = None):
    # Load the Excel files
    reinoud_df = load_excel(reinoud_file, sheet_name=reinoud_sheet)
    sisa_df = load_excel(sisa_file, sheet_name=sisa_sheet)

    # Debug: Print columns of sisa_df
    logging.info(f"Columns in {sisa_file}: {sisa_df.columns.tolist()}")

    # Check for duplicates in reinoud
    duplicates = check_duplicates(reinoud_df, column)
    if duplicates:
        logging.info("Duplicate IDs in reinoud.xlsx:")
        logging.info(duplicates)
    else:
        logging.info("No duplicates found in reinoud.xlsx.")

    # Append missing IDs from sisa to reinoud
    reinoud_df = append_missing_ids(reinoud_df, sisa_df, column, reinoud_file)

    # Save the updated reinoud_df back to the Excel file.
    # Note: this rewrites the workbook with only this sheet; default to 'Sheet1' if no sheet name was given.
    reinoud_df.to_excel(reinoud_file, sheet_name=reinoud_sheet or 'Sheet1', index=False)
    logging.info(f"Updated {reinoud_file} saved.")

if __name__ == "__main__":
    # Example usage
    # change the file names, column name, and sheet names as needed
    main('reinoud.xlsx', 'sisa.xlsx', 'Rolnummer', reinoud_sheet='Actief', sisa_sheet='sheet1')
326
ongeloofelijken tool/script.py
Normal file
@@ -0,0 +1,326 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Update 'BAGES' sheet in 'ongeloofelijken tool.xlsx' with the latest (2025-2026) bachelor History
study programme from UAntwerpen. It scrapes the official page and writes a normalized table.

Source page (2025-2026 bachelor study programme):
https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/
- In 2025-2026 the 'Geschiedenis per periode en gebied' structure changed to a two-pillar model:
  * Chronologische pijler: 3 OOs (middeleeuwen, nieuwe tijd, nieuwste tijd)
  * Thematische pijler: 2 OOs
  (See faculty helpdesk note with change summary and transition measures.)

IMPORTANT:
- This script only updates the 'BAGES' (Bachelor) sheet, because the provided link covers the bachelor page.
- 'MAGES' and 'SPVP' sheets remain untouched.

Tested with: requests, beautifulsoup4, lxml, pandas, openpyxl
"""

import re
import sys
import time
import urllib.parse
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException


# ------------------------- Configuration -------------------------
EXCEL_PATH = "ongeloofelijken tool.xlsx"
TARGET_SHEET = "BAGES"
ARCHIVE_PREFIX = "BAGES_OLD_"
UA_BA_URL = "https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/"
TARGET_YEAR_PREFIX = "2025-"  # Anchor/course URLs have '?id=<year>-<code>'; we filter with '2025-'
TIMEOUT = 30
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; uantwerpen-bages-updater/1.0; +https://www.uantwerpen.be/)",
    "Accept-Language": "nl,en;q=0.8"
}


# ------------------------- Helpers -------------------------
def fetch_html(url: str) -> BeautifulSoup:
    resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")


def extract_text(el) -> str:
    return re.sub(r"\s+", " ", " ".join(el.stripped_strings)) if el else ""


def parse_meta_from_block(block_text: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """
    Try to parse Semester, Credits, Lecturers, Notes from a block of text next to a course link.
    Returns (semester, credits, lecturers, notes)
    """
    text = block_text

    # Semester examples: '1E SEM', '2E SEM', '1E/2E SEM'
    sem = None
    m_sem = re.search(r"\b(1E\s*/\s*2E\s*SEM|1E\s*SEM|2E\s*SEM)\b", text, flags=re.I)
    if m_sem:
        sem = m_sem.group(1).upper().replace(" ", "")

    # Credits examples: '6 studiepunten', '3 studiepunten'
    credits = None
    m_sp = re.search(r"(\d+)\s*studiepunten", text, flags=re.I)
    if m_sp:
        credits = m_sp.group(1)

    # Lecturers: after 'Lesgever (s):' or 'Lesgever(s):'
    lecturers = None
    m_lect = re.search(r"Lesgever\s*\(s\)\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if not m_lect:
        m_lect = re.search(r"Lesgever[s]?\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if m_lect:
        lecturers = m_lect.group(1).strip(" .").replace(" ,", ",")

    # Notes: look for two-yearly etc.
    notes = None
    if re.search(r"Tweejaarlijks", text, flags=re.I):
        # Try to capture the "even/oneven" phrasing
        m_ev = re.search(r"tweejaarlijks[^.]*?(even|oneven)[^.]*jaar", text, flags=re.I)
        notes = "Tweejaarlijks" + (f" ({m_ev.group(1).lower()} jaar)" if m_ev else "")

    return sem, credits, lecturers, notes


def nearest_sections(a_tag) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Find nearest preceding headings to classify the row.
    Returns (section, subsection, pillar)
    - section: e.g., 'Modeltraject deel 1/2/3'
    - subsection: e.g., 'Wijsbegeerte en sociale wetenschappen', 'Keuzeopleidingsonderdelen', 'Inleiding tot de geschiedenis', etc.
    - pillar: for deel 3: 'Chronologische pijler', 'Thematische pijler' or None
    """
    # The site uses a variety of headings (h2, h3, h4); we trace back to find labels
    section = subsection = pillar = None

    # Walk up multiple previous headings to capture a hierarchy
    prev_heads = []
    cur = a_tag
    for _ in range(40):  # limit walk to avoid infinite loops
        cur = cur.find_previous(["h2", "h3", "h4", "h5"])
        if not cur:
            break
        txt = extract_text(cur)
        prev_heads.append(txt)

    # Determine labels from the nearest few headings
    for txt in prev_heads:
        t = txt.lower()
        if section is None and "modeltraject deel" in t:
            # Normalize like "Modeltraject deel 1"
            section = txt
        if subsection is None:
            # Typical subsections
            if any(k in t for k in [
                "wijsbegeerte en sociale wetenschappen",
                "methodologie van de geschiedenis",
                "historische oefeningen",
                "inleiding tot de geschiedenis",
                "heuristiek",
                "historisch overzicht",
                "keuzeopleidingsonderdelen",
                "sociale wetenschappen",
            ]):
                subsection = txt
        if pillar is None and ("chronologische pijler" in t or "thematische pijler" in t):
            pillar = txt

        if section and (subsection or pillar):
            # Good enough
            break

    return section, subsection, pillar


def parse_courses_from_page(soup: BeautifulSoup) -> pd.DataFrame:
    """
    Parse all course links for the 2025-xxxx academic year, infer metadata from nearby text,
    and return a normalized DataFrame.
    """
    rows = []

    # Capture all anchors that look like course links containing '?id=2025-<CODE>'
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Normalize relative links
        full_url = urllib.parse.urljoin(UA_BA_URL, href)
        # Filter by the 'id=2025-' parameter (2025-2026)
        if "id=" in href:
            q = urllib.parse.urlparse(href).query
            params = urllib.parse.parse_qs(q)
            ids = params.get("id", [])
            if not ids:
                continue
            # Some pages use '2025-XXXXX' or '2025-XXXXX&lang=nl'
            if not any(idv.startswith(TARGET_YEAR_PREFIX) for idv in ids):
                continue
            course_id = ids[0]  # e.g., '2025-1002FLWGES'
        else:
            # No id=... parameter; skip
            continue

        # Extract code after '2025-'
        code = None
        m = re.match(r"2025-([A-Za-z0-9]+)", course_id)
        if m:
            code = m.group(1)

        name = extract_text(a).strip()
        if not name or not code:
            continue

        # Use a reasonably large ancestor block for metadata search
        container = a
        for _ in range(4):
            if container.parent:
                container = container.parent
        block_text = extract_text(container)

        semester, credits, lecturers, notes = parse_meta_from_block(block_text)
        section, subsection, pillar = nearest_sections(a)

        rows.append({
            "Section": section,
            "Subsection": subsection,
            "Pillar": pillar,
            "Course Code": code,
            "Course Name": name,
            "URL": full_url,
            "Semester": semester,
            "Credits": credits,
            "Lecturers": lecturers,
            "Notes": notes
        })

    # Guard: with no parsed rows the column-based filtering below would raise a KeyError
    if not rows:
        return pd.DataFrame(rows)

    df = pd.DataFrame(rows).drop_duplicates(subset=["Course Code", "Course Name"])
    # Keep only rows that clearly belong to the 'Bachelor' page; sometimes cross-links appear
    # Heuristic: we keep rows with a Section that starts with "Modeltraject deel" or that have a Pillar marker
    mask = (
        df["Section"].fillna("").str.contains(r"Modeltraject deel", case=False) |
        df["Pillar"].fillna("").str.contains(r"Pijler", case=False)
    )
    df = df[mask].copy()

    # Clean up text for consistency (fill NaN first so missing values become empty strings, not 'None')
    def clean_col(s):
        return s.str.replace(r"\s+", " ", regex=True).str.strip()

    for col in ["Section", "Subsection", "Pillar", "Course Name", "Lecturers", "Notes"]:
        df[col] = clean_col(df[col].fillna("").astype(str))

    # Ensure missing pillar/subsection are empty strings for consistent sorting
    df["Pillar"] = df["Pillar"].fillna("")
    df["Subsection"] = df["Subsection"].fillna("")

    # Sort for readability: section → pillar → subsection → name
    df.sort_values(
        by=["Section", "Pillar", "Subsection", "Course Name"],
        inplace=True
    )
    df.reset_index(drop=True, inplace=True)
    return df


def archive_and_write(excel_path: str, df: pd.DataFrame, target_sheet: str):
    """
    - If sheet 'BAGES' exists, rename it to 'BAGES_OLD_YYYYMMDD'
    - Write df to 'BAGES'
    """
    try:
        wb = load_workbook(excel_path)
    except FileNotFoundError:
        print(f"[INFO] File not found, creating new workbook: {excel_path}")
        # Write a new file straight away
        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        return
    except InvalidFileException:
        print(f"[ERROR] Not a valid Excel file: {excel_path}")
        sys.exit(1)

    # Rename existing BAGES to archive
    if target_sheet in wb.sheetnames:
        date_suffix = datetime.now().strftime("%Y%m%d")
        archive_name = ARCHIVE_PREFIX + date_suffix
        # Ensure uniqueness (append a counter if necessary)
        counter = 1
        final_archive = archive_name
        while final_archive in wb.sheetnames:
            counter += 1
            final_archive = f"{archive_name}_{counter}"
        ws = wb[target_sheet]
        ws.title = final_archive
        print(f"[INFO] Archived existing '{target_sheet}' as '{final_archive}'")

    # Save intermediate
    wb.save(excel_path)

    # Now write the new sheet
    with pd.ExcelWriter(excel_path, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
        df.to_excel(writer, sheet_name=target_sheet, index=False)

    print(f"[SUCCESS] Wrote updated '{target_sheet}' sheet to '{excel_path}'")

    # Also save the updated DataFrame to a separate new Excel file for convenience
    try:
        src = Path(excel_path)
        new_name = src.with_name(f"{src.stem}_updated{src.suffix}")
        # Write a fresh workbook containing only the updated sheet
        with pd.ExcelWriter(str(new_name), engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        print(f"[INFO] Also wrote updated data to new file '{new_name}'")
    except Exception as e:
        print(f"[WARNING] Could not write updated copy to new file: {e}")


def main():
    print("[STEP 1] Fetching the bachelor study programme page (2025-2026)…")
    soup = fetch_html(UA_BA_URL)
    time.sleep(0.3)

    print("[STEP 2] Parsing courses and metadata (this may take a few seconds)…")
    df = parse_courses_from_page(soup)
    if df.empty:
        print("[WARNING] No 2025-xxxx course rows found. The page structure may have changed.")
        print(" Please open the URL in a browser and check if '2025-2026' content is visible.")
    else:
        # Sanity: flag pillar rows (deel 3) visibly
        df["Pillar"] = df["Pillar"].replace({"": None})
        print(f"[INFO] Parsed {len(df)} course rows for 2025-2026.")

        # Optional: give you a quick view in console
        head = df.head(10).to_string(index=False)
        print("[PREVIEW]\n" + head)

    print(f"[STEP 3] Updating Excel: {EXCEL_PATH}")
    archive_and_write(EXCEL_PATH, df, TARGET_SHEET)

    print("\nDone. You can now open the workbook and review the refreshed 'BAGES' sheet.")


if __name__ == "__main__":
    main()


# Try to save a copy of the corrected script beside the original.
def save_copy(dest_name: str = "script_fixed.py"):
    try:
        import pathlib
        src = pathlib.Path(__file__)
        dst = src.with_name(dest_name)
        dst.write_text(src.read_text(encoding="utf-8"), encoding="utf-8")
        print(f"[INFO] Wrote a copy of this script to '{dst}'")
    except Exception as e:
        print(f"[WARNING] Could not write copy: {e}")


save_copy()
326
ongeloofelijken tool/script_fixed.py
Normal file
@@ -0,0 +1,326 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Update 'BAGES' sheet in 'ongeloofelijken tool.xlsx' with the latest (2025-2026) bachelor History
study programme from UAntwerpen. It scrapes the official page and writes a normalized table.

Source page (2025-2026 bachelor study programme):
https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/
- In 2025-2026 the 'Geschiedenis per periode en gebied' structure changed to a two-pillar model:
  * Chronologische pijler: 3 OOs (middeleeuwen, nieuwe tijd, nieuwste tijd)
  * Thematische pijler: 2 OOs
  (See faculty helpdesk note with change summary and transition measures.)

IMPORTANT:
- This script only updates the 'BAGES' (Bachelor) sheet, because the provided link covers the bachelor page.
- 'MAGES' and 'SPVP' sheets remain untouched.

Tested with: requests, beautifulsoup4, lxml, pandas, openpyxl
"""

import re
import sys
import time
import urllib.parse
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple

import requests
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException


# ------------------------- Configuration -------------------------
EXCEL_PATH = "ongeloofelijken tool.xlsx"
TARGET_SHEET = "BAGES"
ARCHIVE_PREFIX = "BAGES_OLD_"
UA_BA_URL = "https://www.uantwerpen.be/nl/studeren/aanbod/alle-opleidingen/geschiedenis-studeren/bachelor/studieprogramma/"
TARGET_YEAR_PREFIX = "2025-"  # Anchor/course URLs have '?id=<year>-<code>'; we filter with '2025-'
TIMEOUT = 30
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; uantwerpen-bages-updater/1.0; +https://www.uantwerpen.be/)",
    "Accept-Language": "nl,en;q=0.8"
}


# ------------------------- Helpers -------------------------
def fetch_html(url: str) -> BeautifulSoup:
    resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "lxml")


def extract_text(el) -> str:
    return re.sub(r"\s+", " ", " ".join(el.stripped_strings)) if el else ""


def parse_meta_from_block(block_text: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """
    Try to parse Semester, Credits, Lecturers, Notes from a block of text next to a course link.
    Returns (semester, credits, lecturers, notes)
    """
    text = block_text

    # Semester examples: '1E SEM', '2E SEM', '1E/2E SEM'
    sem = None
    m_sem = re.search(r"\b(1E\s*/\s*2E\s*SEM|1E\s*SEM|2E\s*SEM)\b", text, flags=re.I)
    if m_sem:
        sem = m_sem.group(1).upper().replace(" ", "")

    # Credits examples: '6 studiepunten', '3 studiepunten'
    credits = None
    m_sp = re.search(r"(\d+)\s*studiepunten", text, flags=re.I)
    if m_sp:
        credits = m_sp.group(1)

    # Lecturers: after 'Lesgever (s):' or 'Lesgever(s):'
    lecturers = None
    m_lect = re.search(r"Lesgever\s*\(s\)\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if not m_lect:
        m_lect = re.search(r"Lesgever[s]?\s*:\s*([^|]+?)(?:\s{2,}|$)", text, flags=re.I)
    if m_lect:
        lecturers = m_lect.group(1).strip(" .").replace(" ,", ",")

    # Notes: look for two-yearly etc.
    notes = None
    if re.search(r"Tweejaarlijks", text, flags=re.I):
        # Try to capture the "even/oneven" phrasing
        m_ev = re.search(r"tweejaarlijks[^.]*?(even|oneven)[^.]*jaar", text, flags=re.I)
        notes = "Tweejaarlijks" + (f" ({m_ev.group(1).lower()} jaar)" if m_ev else "")

    return sem, credits, lecturers, notes


def nearest_sections(a_tag) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Find nearest preceding headings to classify the row.
    Returns (section, subsection, pillar)
    - section: e.g., 'Modeltraject deel 1/2/3'
    - subsection: e.g., 'Wijsbegeerte en sociale wetenschappen', 'Keuzeopleidingsonderdelen', 'Inleiding tot de geschiedenis', etc.
    - pillar: for deel 3: 'Chronologische pijler', 'Thematische pijler' or None
    """
    # The site uses a variety of headings (h2, h3, h4); we trace back to find labels
    section = subsection = pillar = None

    # Walk up multiple previous headings to capture a hierarchy
    prev_heads = []
    cur = a_tag
    for _ in range(40):  # limit walk to avoid infinite loops
        cur = cur.find_previous(["h2", "h3", "h4", "h5"])
        if not cur:
            break
        txt = extract_text(cur)
        prev_heads.append(txt)

    # Determine labels from the nearest few headings
    for txt in prev_heads:
        t = txt.lower()
        if section is None and "modeltraject deel" in t:
            # Normalize like "Modeltraject deel 1"
            section = txt
        if subsection is None:
            # Typical subsections
            if any(k in t for k in [
                "wijsbegeerte en sociale wetenschappen",
                "methodologie van de geschiedenis",
                "historische oefeningen",
                "inleiding tot de geschiedenis",
                "heuristiek",
                "historisch overzicht",
                "keuzeopleidingsonderdelen",
                "sociale wetenschappen",
            ]):
                subsection = txt
        if pillar is None and ("chronologische pijler" in t or "thematische pijler" in t):
            pillar = txt

        if section and (subsection or pillar):
            # Good enough
            break

    return section, subsection, pillar


def parse_courses_from_page(soup: BeautifulSoup) -> pd.DataFrame:
    """
    Parse all course links for the 2025-xxxx academic year, infer metadata from nearby text,
    and return a normalized DataFrame.
    """
    rows = []

    # Capture all anchors that look like course links containing '?id=2025-<CODE>'
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Normalize relative links
        full_url = urllib.parse.urljoin(UA_BA_URL, href)
        # Filter by the 'id=2025-' parameter (2025-2026)
        if "id=" in href:
            q = urllib.parse.urlparse(href).query
            params = urllib.parse.parse_qs(q)
            ids = params.get("id", [])
            if not ids:
                continue
            # Some pages use '2025-XXXXX' or '2025-XXXXX&lang=nl'
            if not any(idv.startswith(TARGET_YEAR_PREFIX) for idv in ids):
                continue
            course_id = ids[0]  # e.g., '2025-1002FLWGES'
        else:
            # No id=... parameter; skip
            continue

        # Extract code after '2025-'
        code = None
        m = re.match(r"2025-([A-Za-z0-9]+)", course_id)
        if m:
            code = m.group(1)

        name = extract_text(a).strip()
        if not name or not code:
            continue

        # Use a reasonably large ancestor block for metadata search
        container = a
        for _ in range(4):
            if container.parent:
                container = container.parent
        block_text = extract_text(container)

        semester, credits, lecturers, notes = parse_meta_from_block(block_text)
        section, subsection, pillar = nearest_sections(a)

        rows.append({
            "Section": section,
            "Subsection": subsection,
            "Pillar": pillar,
            "Course Code": code,
            "Course Name": name,
            "URL": full_url,
            "Semester": semester,
            "Credits": credits,
            "Lecturers": lecturers,
            "Notes": notes
        })

    # Guard: with no parsed rows the column-based filtering below would raise a KeyError
    if not rows:
        return pd.DataFrame(rows)

    df = pd.DataFrame(rows).drop_duplicates(subset=["Course Code", "Course Name"])
    # Keep only rows that clearly belong to the 'Bachelor' page; sometimes cross-links appear
    # Heuristic: we keep rows with a Section that starts with "Modeltraject deel" or that have a Pillar marker
    mask = (
        df["Section"].fillna("").str.contains(r"Modeltraject deel", case=False) |
        df["Pillar"].fillna("").str.contains(r"Pijler", case=False)
    )
    df = df[mask].copy()

    # Clean up text for consistency (fill NaN first so missing values become empty strings, not 'None')
    def clean_col(s):
        return s.str.replace(r"\s+", " ", regex=True).str.strip()

    for col in ["Section", "Subsection", "Pillar", "Course Name", "Lecturers", "Notes"]:
        df[col] = clean_col(df[col].fillna("").astype(str))

    # Ensure missing pillar/subsection are empty strings for consistent sorting
    df["Pillar"] = df["Pillar"].fillna("")
    df["Subsection"] = df["Subsection"].fillna("")

    # Sort for readability: section → pillar → subsection → name
    df.sort_values(
        by=["Section", "Pillar", "Subsection", "Course Name"],
        inplace=True
    )
    df.reset_index(drop=True, inplace=True)
    return df


def archive_and_write(excel_path: str, df: pd.DataFrame, target_sheet: str):
    """
    - If sheet 'BAGES' exists, rename it to 'BAGES_OLD_YYYYMMDD'
    - Write df to 'BAGES'
    """
    try:
        wb = load_workbook(excel_path)
    except FileNotFoundError:
        print(f"[INFO] File not found, creating new workbook: {excel_path}")
        # Write a new file straight away
        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        return
    except InvalidFileException:
        print(f"[ERROR] Not a valid Excel file: {excel_path}")
        sys.exit(1)

    # Rename existing BAGES to archive
    if target_sheet in wb.sheetnames:
        date_suffix = datetime.now().strftime("%Y%m%d")
        archive_name = ARCHIVE_PREFIX + date_suffix
        # Ensure uniqueness (append a counter if necessary)
        counter = 1
        final_archive = archive_name
        while final_archive in wb.sheetnames:
            counter += 1
            final_archive = f"{archive_name}_{counter}"
        ws = wb[target_sheet]
        ws.title = final_archive
        print(f"[INFO] Archived existing '{target_sheet}' as '{final_archive}'")

    # Save intermediate
    wb.save(excel_path)

    # Now write the new sheet
    with pd.ExcelWriter(excel_path, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
        df.to_excel(writer, sheet_name=target_sheet, index=False)

    print(f"[SUCCESS] Wrote updated '{target_sheet}' sheet to '{excel_path}'")

    # Also save the updated DataFrame to a separate new Excel file for convenience
    try:
        src = Path(excel_path)
        new_name = src.with_name(f"{src.stem}_updated{src.suffix}")
        # Write a fresh workbook containing only the updated sheet
        with pd.ExcelWriter(str(new_name), engine="openpyxl") as writer:
            df.to_excel(writer, sheet_name=target_sheet, index=False)
        print(f"[INFO] Also wrote updated data to new file '{new_name}'")
    except Exception as e:
        print(f"[WARNING] Could not write updated copy to new file: {e}")


def main():
    print("[STEP 1] Fetching the bachelor study programme page (2025-2026)…")
    soup = fetch_html(UA_BA_URL)
    time.sleep(0.3)

    print("[STEP 2] Parsing courses and metadata (this may take a few seconds)…")
    df = parse_courses_from_page(soup)
    if df.empty:
        print("[WARNING] No 2025-xxxx course rows found. The page structure may have changed.")
        print(" Please open the URL in a browser and check if '2025-2026' content is visible.")
    else:
        # Sanity: flag pillar rows (deel 3) visibly
        df["Pillar"] = df["Pillar"].replace({"": None})
        print(f"[INFO] Parsed {len(df)} course rows for 2025-2026.")

        # Optional: give you a quick view in console
        head = df.head(10).to_string(index=False)
        print("[PREVIEW]\n" + head)

    print(f"[STEP 3] Updating Excel: {EXCEL_PATH}")
    archive_and_write(EXCEL_PATH, df, TARGET_SHEET)

    print("\nDone. You can now open the workbook and review the refreshed 'BAGES' sheet.")


if __name__ == "__main__":
    main()


# Try to save a copy of the corrected script beside the original.
def save_copy(dest_name: str = "script_fixed.py"):
    try:
        import pathlib
        src = pathlib.Path(__file__)
        dst = src.with_name(dest_name)
        dst.write_text(src.read_text(encoding="utf-8"), encoding="utf-8")
        print(f"[INFO] Wrote a copy of this script to '{dst}'")
    except Exception as e:
        print(f"[WARNING] Could not write copy: {e}")


save_copy()
BIN
ongeloofelijken tool/updated
Normal file
Binary file not shown.
@@ -181,22 +181,49 @@ def check_students_with_mismatching_SP_values(predelib_df: pd.DataFrame) -> List
 
         logger.info("All required columns found in dataframe")
 
-        # Check for mismatching SP values
-        mismatching_students = []
-        for index, row in predelib_df.iterrows():
-            if row['Totaal aantal SP'] != row['Aantal SP vereist']:
-                mismatching_students.append({
-                    'ID': row['ID'],
-                    'Achternaam': row['Achternaam'],
-                    'Voornaam': row['Voornaam'],
-                    'E-mail': row['E-mail'],
-                    'Totaal_aantal_SP': row['Totaal aantal SP'],
-                    'Aantal_SP_vereist': row['Aantal SP vereist'],
-                    'Waarschuwing': row['Waarschuwing'],
-                    'Adviesrapport_code': row['Adviesrapport code']
-                })
-
-        logger.info(f"Found {len(mismatching_students)} students with mismatching SP values")
+        # Use vectorized comparison to find rows where the SP values differ
+        sp_col = predelib_df['Totaal aantal SP']
+        req_col = predelib_df['Aantal SP vereist']
+
+        # Simple inequality works for most cases; NaN != NaN will be True which is acceptable
+        mask = sp_col != req_col
+        mismatches_df = predelib_df[mask].copy()
+
+        logger.info(f"Found {len(mismatches_df)} raw rows with mismatching SP values")
+
+        if mismatches_df.empty:
+            logger.info("No students with mismatching SP values found")
+            return []
+
+        # Keep only unique students by 'ID' (first occurrence).
+        if 'ID' in mismatches_df.columns:
+            before_dedup = len(mismatches_df)
+            mismatches_df = mismatches_df.drop_duplicates(subset=['ID'])
+            after_dedup = len(mismatches_df)
+            logger.info(f"Reduced from {before_dedup} rows to {after_dedup} unique students by ID")
+        else:
+            logger.warning("Column 'ID' not found - cannot deduplicate by student ID")
+
+        # Ensure optional columns exist to avoid KeyError when building dicts
+        for optional_col in ('Waarschuwing', 'Adviesrapport code'):
+            if optional_col not in mismatches_df.columns:
+                mismatches_df[optional_col] = None
+
+        # Build the list of mismatching students
+        mismatching_students = []
+        for _, row in mismatches_df.iterrows():
+            mismatching_students.append({
+                'ID': row.get('ID'),
+                'Achternaam': row.get('Achternaam'),
+                'Voornaam': row.get('Voornaam'),
+                'E-mail': row.get('E-mail'),
+                'Totaal_aantal_SP': row.get('Totaal aantal SP'),
+                'Aantal_SP_vereist': row.get('Aantal SP vereist'),
+                'Waarschuwing': row.get('Waarschuwing'),
+                'Adviesrapport_code': row.get('Adviesrapport code')
+            })
+
+        logger.info(f"Returning {len(mismatching_students)} unique students with mismatching SP values")
         return mismatching_students
 
     except Exception as e:
File diff suppressed because it is too large
@@ -1,4 +0,0 @@
Extracurricular courses do not come out of the wizard; a separate Excel export is needed for that, namely the "dashboard inschrijvingen" (enrolment dashboard).
For the check whether students take extra elective courses (so the requirement has to be raised): compare the columns 'Totaal aantal SP' and 'Aantal SP vereist'.

This column must also be checked when they actually take fewer credits; the two columns must therefore always be equal.
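The rule in these notes is what the vectorized SP-comparison earlier in this commit implements. A minimal, self-contained sketch of that check (the example DataFrame is hypothetical; the column names follow the notes):

```python
import pandas as pd

# Hypothetical pre-deliberation data; in practice this comes from the exported Excel file.
predelib_df = pd.DataFrame({
    "ID": [101, 102, 103],
    "Totaal aantal SP": [60, 66, 54],
    "Aantal SP vereist": [60, 60, 60],
})

# The two columns must always be equal; flag every row where they differ.
mask = predelib_df["Totaal aantal SP"] != predelib_df["Aantal SP vereist"]
print(predelib_df.loc[mask, ["ID", "Totaal aantal SP", "Aantal SP vereist"]])
```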