first commit

This commit is contained in:
bdaneels 2024-11-18 11:59:21 +01:00
parent d51c60d76d
commit e3e65a9c51
12 changed files with 1997 additions and 0 deletions

11
.gitignore vendored Normal file
View File

@ -0,0 +1,11 @@
# Ignore .idea directories
*.idea/
# Ignore compiled Python files
*.pyc
__pycache__/
# Ignore Excel files
*.xlsx
sisa_crawl/

20
examen dubbels/script.py Normal file
View File

@ -0,0 +1,20 @@
import pandas as pd
# Input workbook, worksheet and the column that is checked for repeated values
file_path = 'ps (30).xlsx'
sheet_name = 'ps (30)'
column_name = 'Student-ID'

df = pd.read_excel(file_path, sheet_name=sheet_name)

# keep=False flags every occurrence of a repeated ID, not only the later ones
is_repeated = df.duplicated(subset=[column_name], keep=False)
duplicate_ids = df.loc[is_repeated, column_name]

# Collapse to a single entry per repeated ID before counting and reporting
unique_duplicate_ids = duplicate_ids.drop_duplicates()
num_duplicates = len(unique_duplicate_ids)

if unique_duplicate_ids.empty:
    print("No duplicates found.")
else:
    print(f"Duplicated Student-ID values (count: {num_duplicates}) :")
    print(unique_duplicate_ids)

View File

@ -0,0 +1,18 @@
import pandas as pd
# Source workbook and worksheet
file_path = 'file.xlsx'
sheet_name = 'ps (32)'
df = pd.read_excel(file_path, sheet_name=sheet_name)

# Keep written exams ('Schriftelijk') with more than 65 enrolments
filtered_df = df[df['Examenvorm'] == 'Schriftelijk']
filtered_df = filtered_df[filtered_df['Aant. inschr.'] > 65]
# Take an explicit copy so the column assignments below modify an independent
# frame instead of a chained slice of df (avoids SettingWithCopyWarning).
filtered_df = filtered_df[['Datum S+', 'Beginuur S+', 'Einduur S+', 'Studiegidsnr.', 'Omschrijving', 'Docenten',
                           'Aant. inschr.']].copy()

# Format the time values as HH:MM; missing cells are left as they are
# because they have no strftime method.
for time_column in ('Beginuur S+', 'Einduur S+'):
    filtered_df[time_column] = filtered_df[time_column].apply(
        lambda x: x.strftime('%H:%M') if pd.notna(x) else x
    )

# Strip the role labels from the lecturer names
filtered_df['Docenten'] = filtered_df['Docenten'].str.replace(r'\b(Titularis|Co-Titularis|Medewerker)\b', '',
                                                              regex=True).str.strip()

filtered_df.to_excel('filtered_grote_lokalen.xlsx', index=False)

View File

@ -0,0 +1,7 @@
<xml xmlns:o="urn:schemas-microsoft-com:office:office">
<o:MainFile HRef="../ps%20(32).xls"/>
<o:File HRef="stylesheet.css"/>
<o:File HRef="tabstrip.htm"/>
<o:File HRef="sheet001.htm"/>
<o:File HRef="filelist.xml"/>
</xml>

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,175 @@
tr
{mso-height-source:auto;}
col
{mso-width-source:auto;}
br
{mso-data-placement:same-cell;}
.style0
{mso-number-format:General;
text-align:general;
vertical-align:bottom;
white-space:nowrap;
mso-rotate:0;
mso-background-source:auto;
mso-pattern:auto;
color:black;
font-size:11.0pt;
font-weight:400;
font-style:normal;
text-decoration:none;
font-family:Calibri, sans-serif;
mso-font-charset:0;
border:none;
mso-protection:locked visible;
mso-style-name:Normal;
mso-style-id:0;}
td
{mso-style-parent:style0;
padding-top:1px;
padding-right:1px;
padding-left:1px;
mso-ignore:padding;
color:black;
font-size:11.0pt;
font-weight:400;
font-style:normal;
text-decoration:none;
font-family:Calibri, sans-serif;
mso-font-charset:0;
mso-number-format:General;
text-align:general;
vertical-align:bottom;
border:none;
mso-background-source:auto;
mso-pattern:auto;
mso-protection:locked visible;
white-space:nowrap;
mso-rotate:0;}
.xl65
{mso-style-parent:style0;
font-size:10.0pt;
font-weight:700;
text-align:center;
vertical-align:middle;
border:.5pt solid black;
white-space:normal;}
.xl66
{mso-style-parent:style0;
border:.5pt solid black;
white-space:normal;}
.xl67
{mso-style-parent:style0;
font-size:10.0pt;
border:.5pt solid black;
white-space:normal;}
.xl68
{mso-style-parent:style0;
font-size:10.0pt;
mso-number-format:"Short Date";
border:.5pt solid black;
white-space:normal;}
.xl69
{mso-style-parent:style0;
font-size:10.0pt;
mso-number-format:"Short Time";
border:.5pt solid black;
white-space:normal;}
.xl70
{mso-style-parent:style0;
font-size:10.0pt;
border-top:.5pt solid black;
border-right:.5pt solid black;
border-bottom:none;
border-left:.5pt solid black;
white-space:normal;}
.xl71
{mso-style-parent:style0;
font-size:10.0pt;
border-top:none;
border-right:.5pt solid black;
border-bottom:none;
border-left:.5pt solid black;
white-space:normal;}
.xl72
{mso-style-parent:style0;
font-size:10.0pt;
border-top:none;
border-right:.5pt solid black;
border-bottom:.5pt solid black;
border-left:.5pt solid black;
white-space:normal;}
.xl73
{mso-style-parent:style0;
border-top:.5pt solid black;
border-right:.5pt solid black;
border-bottom:none;
border-left:.5pt solid black;
white-space:normal;}
.xl74
{mso-style-parent:style0;
border-top:none;
border-right:.5pt solid black;
border-bottom:none;
border-left:.5pt solid black;
white-space:normal;}
.xl75
{mso-style-parent:style0;
border-top:none;
border-right:.5pt solid black;
border-bottom:.5pt solid black;
border-left:.5pt solid black;
white-space:normal;}
.xl76
{mso-style-parent:style0;
font-size:10.0pt;
mso-number-format:"Short Date";
border-top:.5pt solid black;
border-right:.5pt solid black;
border-bottom:none;
border-left:.5pt solid black;
white-space:normal;}
.xl77
{mso-style-parent:style0;
font-size:10.0pt;
mso-number-format:"Short Date";
border-top:none;
border-right:.5pt solid black;
border-bottom:none;
border-left:.5pt solid black;
white-space:normal;}
.xl78
{mso-style-parent:style0;
font-size:10.0pt;
mso-number-format:"Short Date";
border-top:none;
border-right:.5pt solid black;
border-bottom:.5pt solid black;
border-left:.5pt solid black;
white-space:normal;}
.xl79
{mso-style-parent:style0;
font-size:10.0pt;
mso-number-format:"Short Time";
border-top:.5pt solid black;
border-right:.5pt solid black;
border-bottom:none;
border-left:.5pt solid black;
white-space:normal;}
.xl80
{mso-style-parent:style0;
font-size:10.0pt;
mso-number-format:"Short Time";
border-top:none;
border-right:.5pt solid black;
border-bottom:none;
border-left:.5pt solid black;
white-space:normal;}
.xl81
{mso-style-parent:style0;
font-size:10.0pt;
mso-number-format:"Short Time";
border-top:none;
border-right:.5pt solid black;
border-bottom:.5pt solid black;
border-left:.5pt solid black;
white-space:normal;}

View File

@ -0,0 +1,32 @@
<html>
<head>
<meta http-equiv=Content-Type content="text/html; charset=utf-8">
<meta name=ProgId content=Excel.Sheet>
<meta name=Generator content="Microsoft Excel 15">
<link id=Main-File rel=Main-File href="../ps%20(32).xls">
<script language="JavaScript">
<!--
if (window.name!="frTabs")
window.location.replace(document.all.item("Main-File").href);
//-->
</script>
<style>
<!--
A {
text-decoration:none;
color:#000000;
font-size:9pt;
}
-->
</style>
</head>
<body topmargin=0 leftmargin=0 bgcolor="#808080">
<table border=0 cellspacing=1>
<tr>
<td bgcolor="#FFFFFF" nowrap><b><small><small>&nbsp;<a href="sheet001.htm" target="frSheet"><font face="Arial" color="#000000">ps (32)</font></a>&nbsp;</small></small></b></td>
</tr>
</table>
</body>
</html>

View File

@ -0,0 +1,78 @@
import pandas as pd
def read_excel_file(file_path):
    """Load an Excel workbook into a DataFrame.

    Returns the result of ``pd.read_excel(file_path)``. If reading raises,
    the error is printed and ``None`` is returned instead.
    """
    try:
        frame = pd.read_excel(file_path)
    except Exception as err:
        print(f"Error reading the Excel file: {err}")
        return None
    else:
        return frame
def filter_studiegidsnummer(df):
    """Return a copy of the rows whose 'Studiegidsnummer' contains 'GES'.

    Rows with a missing 'Studiegidsnummer' are dropped (``na=False``).
    If the column is absent, the available columns are printed and an
    empty DataFrame is returned.
    """
    if 'Studiegidsnummer' not in df.columns:
        # The message names the column exactly as it is looked up above.
        print("Column 'Studiegidsnummer' not found in the DataFrame.")
        print("Available columns:", df.columns)
        return pd.DataFrame()  # Return an empty DataFrame
    return df[df['Studiegidsnummer'].str.contains('GES', na=False)].copy()
def filter_opmerkingen(df):
    """Return a copy of the rows whose 'Opmerkingen' does not mention '24-25'.

    Rows with a missing 'Opmerkingen' value are kept. If the column is
    absent, the available columns are printed and an empty DataFrame is
    returned.
    """
    if 'Opmerkingen' in df.columns:
        mentions_year = df['Opmerkingen'].str.contains('24-25', na=False)
        return df[~mentions_year].copy()

    print("Column 'Opmerkingen' not found in the DataFrame.")
    print("Available columns:", df.columns)
    return pd.DataFrame()  # Return an empty DataFrame
def create_message_column(df):
    """Add 'Message' and 'subject' columns built from each row's exam data.

    The columns are written onto ``df`` itself, which is also returned.
    """
    def build_message(row):
        # Body of the e-mail sent to the lecturer for this course
        return (
            f"Beste docent,\n\n"
            f"Ik ben de examengegevens aan het controleren van {row['Omschrijving']} {row['Studiegidsnummer']}. De huidige gegevens zijn als volgt:\n\n"
            f"{row['Examenvorm']} examen voor zowel eerste als tweede zit, {row['Examenduur']} minuten, tussen {row['Beginuur voormiddag']} en {row['Einduur voormiddag']} of {row['Beginuur namiddag']} en {row['Einduur namiddag']}.\n\n"
            f"Gelden dezelfde gegevens voor dit academiejaar of moeten er nog wijzigingen doorgevoerd worden? Alvast dank voor je reactie!"
        )

    def build_subject(row):
        # Subject line of that e-mail
        return f"Examengegevens {row['Omschrijving']} {row['Studiegidsnummer']}"

    df.loc[:, 'Message'] = df.apply(build_message, axis=1)
    df.loc[:, 'subject'] = df.apply(build_subject, axis=1)
    return df
def save_to_excel(df, output_file_path):
    """Write ``df`` to ``output_file_path`` as an Excel file without the index.

    Any error raised while writing is printed; nothing is raised or returned.
    """
    try:
        export = df.to_excel
        export(output_file_path, index=False)
    except Exception as err:
        print(f"Error saving the Excel file: {err}")
def convert_time_format(time_str):
    """Return ``time_str`` reformatted as 'HH:MM'.

    The value is parsed with ``pd.to_datetime``. If parsing or formatting
    raises, the error is printed and the input is returned unchanged.
    """
    try:
        parsed = pd.to_datetime(time_str)
        short_form = parsed.strftime('%H:%M')
    except Exception as err:
        print(f"Error converting time format: {err}")
        return time_str
    return short_form
def apply_time_format_conversion(df, columns):
    """Rewrite the given columns of ``df`` from 'HH:MM:SS' to 'HH:MM'.

    Values that do not parse as 'HH:MM:SS' become missing (``errors='coerce'``).
    The columns are replaced on ``df`` itself, which is also returned.
    """
    for name in columns:
        parsed = pd.to_datetime(df[name], format='%H:%M:%S', errors='coerce')
        df[name] = parsed.dt.strftime('%H:%M')
    return df
# Script entry point
def main():
    """Read the exam data, filter it, build the e-mail texts and save the result.

    Nothing is written when the workbook cannot be read or when no row has a
    'Studiegidsnummer' containing 'GES'.
    """
    file_path = 'examengegevens2425.xlsx'
    output_file_path = 'filtered_examengegevens2425.xlsx'

    df = read_excel_file(file_path)
    if df is None:
        return

    filtered_df = filter_studiegidsnummer(df)
    if filtered_df.empty:
        return

    final_filtered_df = filter_opmerkingen(filtered_df)

    # Shorten the four time columns to HH:MM before they are used in the message
    time_columns = ['Beginuur voormiddag', 'Einduur voormiddag', 'Beginuur namiddag', 'Einduur namiddag']
    final_filtered_df = apply_time_format_conversion(final_filtered_df, time_columns)

    final_filtered_df = create_message_column(final_filtered_df)
    save_to_excel(final_filtered_df, output_file_path)


if __name__ == "__main__":
    main()

131
random/script 2.py Normal file
View File

@ -0,0 +1,131 @@
import pandas as pd
from openpyxl import load_workbook
from dateutil import parser
import re
def list_sheets(file):
    """Return the worksheet names of the Excel workbook at ``file``.

    Raises:
        ValueError: If the workbook cannot be opened or read; the underlying
            error is printed and attached as the cause.
    """
    workbook = None
    try:
        workbook = load_workbook(filename=file, read_only=True)
        return workbook.sheetnames
    except Exception as e:
        print(e)
        raise ValueError(f"Could not open the file '{file}'. Please check the file and try again.") from e
    finally:
        # A read-only workbook keeps its file handle open until closed explicitly.
        if workbook is not None:
            workbook.close()
def dutch_date_parser(date_str):
    """Parse a Dutch date string into a datetime, or return ``pd.NaT``.

    Dutch weekday names are removed and Dutch month names are replaced by
    their English equivalents (both case-insensitively) before the text is
    handed to ``parser.parse`` with ``dayfirst=True``. A ``ValueError`` from
    the parser yields ``pd.NaT``.
    """
    dutch_to_english = {
        'januari': 'January', 'februari': 'February', 'maart': 'March',
        'april': 'April', 'mei': 'May', 'juni': 'June', 'juli': 'July',
        'augustus': 'August', 'september': 'September', 'oktober': 'October',
        'november': 'November', 'december': 'December'
    }
    weekday_pattern = r'\b(maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)\b'

    # Strip the weekday, then swap each Dutch month name for the English one
    cleaned = re.sub(weekday_pattern, '', date_str, flags=re.IGNORECASE).strip()
    for dutch_name, english_name in dutch_to_english.items():
        cleaned = re.sub(rf'\b{dutch_name}\b', english_name, cleaned, flags=re.IGNORECASE)

    try:
        parsed = parser.parse(cleaned, dayfirst=True)
    except ValueError:
        return pd.NaT
    return parsed
def compare_roosters(base_file, comparison_file, output_file):
    """Compare two exam-schedule workbooks and write the differences to Excel.

    Rows are matched on 'Code examenrooster'. For codes present in both
    files, 'Beginuur S+', 'Datum S+' and 'Einduur S+' are compared. Codes
    present in only one file are reported as missing from the other.

    Each differing row in the output holds the code, the base value of every
    compared column under the plain column name, the comparison value under
    '<column>_comp', and a 'Difference' text listing what differs.

    Args:
        base_file: Path of the reference workbook.
        comparison_file: Path of the workbook compared against the reference.
            String values in its 'Datum S+' column are parsed with
            dutch_date_parser before comparing.
        output_file: Path of the Excel file that receives the differences.

    Raises:
        ValueError: If a workbook cannot be opened or loaded, or if a
            required column is missing from one of the files.
    """
    key_column = 'Code examenrooster'
    compared_columns = ['Beginuur S+', 'Datum S+', 'Einduur S+']
    required_columns = [key_column] + compared_columns

    # Print the sheets available in both Excel files
    base_sheets = list_sheets(base_file)
    comparison_sheets = list_sheets(comparison_file)
    print(f"Sheets in '{base_file}': {base_sheets}")
    print(f"Sheets in '{comparison_file}': {comparison_sheets}")

    # Function to load an Excel file with error handling
    def load_excel(file):
        try:
            df = pd.read_excel(file, engine='openpyxl')
            if df.empty:
                raise ValueError(f"The file '{file}' has no sheets or is empty.")
            return df
        except Exception as e:
            print(e)
            raise ValueError(f"Could not load the file '{file}'. Please check the file and try again.") from e

    # Load the Excel files
    base_df = load_excel(base_file)
    comparison_df = load_excel(comparison_file)

    # Ensure the columns we need are present in both files
    for column in required_columns:
        if column not in base_df.columns or column not in comparison_df.columns:
            raise ValueError(f"Column '{column}' is missing from one of the files")

    # Convert 'Datum S+' in comparison_df to the universal format
    comparison_df['Datum S+'] = comparison_df['Datum S+'].apply(
        lambda x: dutch_date_parser(x) if isinstance(x, str) else x
    )

    # Merge on the key. Every compared column exists in both frames, so the
    # merge renames it to '<column>_base' and '<column>_comp'.
    merged_df = base_df.merge(
        comparison_df,
        on=key_column,
        suffixes=('_base', '_comp'),
        how='outer',  # Outer join to capture all differences
        indicator=True  # Indicator to show if the row was in one or both files
    )

    differences = []
    for _, row in merged_df.iterrows():
        if row['_merge'] == 'both':
            differences_in_row = []
            for column in compared_columns:
                base_value = row.get(f"{column}_base", pd.NA)
                comp_value = row.get(f"{column}_comp", pd.NA)

                if pd.isna(base_value) and pd.isna(comp_value):
                    continue  # Skip comparison if both are NaN
                elif base_value != comp_value:
                    differences_in_row.append(f"{column} differs (Base: {base_value}, Comp: {comp_value})")

            if differences_in_row:
                row_data = {key_column: row[key_column]}
                for column in compared_columns:
                    # Read the base value from the suffixed merge column; the
                    # unsuffixed name no longer exists after the merge.
                    row_data[column] = row.get(f"{column}_base", pd.NA)
                    row_data[f"{column}_comp"] = row.get(f"{column}_comp", pd.NA)
                row_data['Difference'] = "; ".join(differences_in_row)
                differences.append(row_data)

        elif row['_merge'] == 'left_only':
            differences.append({
                key_column: row[key_column],
                'Difference': "Row missing in comparison file"
            })
        elif row['_merge'] == 'right_only':
            differences.append({
                key_column: row[key_column],
                'Difference': "Row missing in base file"
            })

    # Create a DataFrame from the differences list
    differences_df = pd.DataFrame(differences)

    # Save the differences to an Excel file
    differences_df.to_excel(output_file, index=False)
    print(f"Differences saved to {output_file}")
# Example usage: run the comparison only when this file is executed as a
# script, so importing it does not read or write any workbook.
if __name__ == "__main__":
    compare_roosters('afgewerkte.xlsx', 'bages rooster voor s.xlsx', 'differences_output.xlsx')

66
random/script.py Normal file
View File

@ -0,0 +1,66 @@
import pandas as pd
from datetime import datetime
import locale
# Workbook and worksheet holding the schedule
file_path = 'bages rooster voor s.xlsx'
sheet_name = 'rooster'
df = pd.read_excel(file_path, sheet_name=sheet_name)

# Maps an inclusive (first day, last day) pair to its Lesweek number
date_ranges = {
    (pd.Timestamp(first_day), pd.Timestamp(last_day)): lesweek
    for first_day, last_day, lesweek in [
        ('2025-01-06', '2025-01-12', 16),
        ('2025-01-13', '2025-01-19', 17),
        ('2025-01-20', '2025-01-26', 18),
        ('2025-01-27', '2025-02-02', 19),
        # add more ranges as needed
    ]
}
# Custom date parser function
def parse_custom_date(date_str):
    """Parse a Dutch '<weekday> <day> <month> <year>' string into a datetime.

    Missing values yield ``pd.NaT``. LC_TIME is switched to 'nl_NL.UTF-8'
    only for the duration of the parse and is then restored to the locale
    that was active before the call.

    Raises:
        ValueError: If the string does not match '%A %d %B %Y'.
        TypeError: If the value is neither missing nor a string.
    """
    if pd.isna(date_str):
        return pd.NaT  # Return pandas NaT for missing dates
    if not isinstance(date_str, str):
        raise TypeError(f"Expected string, got {type(date_str).__name__}: {date_str}")

    # Calling setlocale without a value returns the current setting
    previous_locale = locale.setlocale(locale.LC_TIME)
    try:
        # Set locale to Dutch so %A and %B match Dutch day and month names
        locale.setlocale(locale.LC_TIME, 'nl_NL.UTF-8')
        return datetime.strptime(date_str, '%A %d %B %Y')
    except ValueError as e:
        raise ValueError(f"Date conversion error: {e} for date string: {date_str}") from e
    finally:
        # Put back whatever locale the caller had, not a hard-coded one
        locale.setlocale(locale.LC_TIME, previous_locale)
# Parse the 'Datum S+' column into datetimes when the sheet has one
if 'Datum S+' in df.columns:
    try:
        # Run every cell through the custom Dutch date parser
        parsed_dates = df['Datum S+'].apply(parse_custom_date)
        df['Datum S+'] = parsed_dates
    except (ValueError, TypeError) as conversion_error:
        print(f"Error: {conversion_error}")
        # Report the problem, then stop the script by re-raising
        raise
# Look up the Lesweek number for a date
def update_lesweek(date):
    """Return the Lesweek of the first range in ``date_ranges`` containing ``date``.

    Returns 0 for missing dates and for dates outside every range.
    """
    if pd.isna(date):  # Handle NaT values
        return 0
    matching_weeks = (
        lesweek_value
        for (range_start, range_end), lesweek_value in date_ranges.items()
        if range_start <= date <= range_end
    )
    # Default value if date doesn't fall in any range
    return next(matching_weeks, 0)
# Derive the Lesweek column from the parsed dates
lesweek_values = df['Datum S+'].apply(update_lesweek)
df['Lesweek'] = lesweek_values

# Print a preview so the date formatting can be checked by eye
preview = df.head()
print("\nFirst few rows of the DataFrame to verify date formatting:\n", preview)

# Write the updated schedule to a new Excel file
df.to_excel('updated_rooster.xlsx', index=False)

View File

@ -0,0 +1,63 @@
import asyncio
from pyppeteer import launch
import logging
logging.basicConfig(level=logging.INFO)
async def crawl(url):
    """Open ``url`` in a visible Chromium window and print every link on the page.

    Failures are logged instead of raised: navigation, page-load and
    extraction problems are logged as errors, anything else as critical.
    Once the browser has been launched it is closed on every exit path.
    """
    browser = None
    try:
        # Launch a new Chromium browser with a visible window
        print('browser launching')
        browser = await launch(headless=False)

        # Open a new page
        page = await browser.newPage()
        print('browser opened')

        try:
            # Navigate to the specified URL
            await page.goto(url)
        except Exception as e:
            logging.error("Failed to navigate to %s: %s", url, e)
            return
        logging.info("Accessed %s", url)

        try:
            # Wait for the page to fully load
            await page.waitForSelector('body')
        except Exception as e:
            logging.error("Failed to load the page properly: %s", e)
            return

        try:
            # Extract and print all links as an example
            links = await page.evaluate('''() => {
                return Array.from(document.querySelectorAll('a')).map(link => ({
                    text: link.innerText,
                    url: link.href
                }));
            }''')
            for link in links:
                print(f"Link text: {link['text']}, URL: {link['url']}")
        except Exception as e:
            logging.error("Error extracting or processing the content: %s", e)
    except Exception as e:
        logging.critical("Critical error occurred: %s", e)
    finally:
        # Single close point: also covers a failure between launch and newPage
        if browser is not None:
            try:
                await browser.close()
            except Exception as e:
                logging.critical("Critical error occurred: %s", e)
# Specify the URL of the web page you want to crawl
url = 'https://www.google.com/'

# Run the crawl function. asyncio.run creates and closes its own event loop,
# so it does not depend on an implicitly created current loop.
asyncio.run(crawl(url))

View File