# Provenance: script.py as added by commit
# f3b5792fe55fdcfe94bb2076d06814999d2038f9 ("first commit", author bdaneels,
# Mon Oct 14 15:20:17 2024 +0200). The same commit added a .gitignore that
# excludes /Mentoraat_2024-2025.xlsx, /reinoud.xlsx and /sisa.xlsx.
"""Append students that appear in the SISA export but not in the reinoud workbook.

The two workbooks are matched on an ID column (``Rolnummer`` in the example
invocation). Rows whose ID only occurs in the SISA sheet are appended to the
reinoud sheet, which is then written back to the reinoud workbook.
"""
import logging
from typing import List, Optional

import pandas as pd

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

# SISA columns copied next to the ID column when a row is appended.
_SISA_DETAIL_COLUMNS = ('Naam', 'Voornaam', 'Plan', 'Campus emailadres')
# SISA column name -> name used for the same data in the reinoud sheet.
_SISA_TO_REINOUD = {'Campus emailadres': 'mail'}


def _normalize_id(value) -> Optional[str]:
    """Return a comparable string for one ID cell, or None for a blank cell.

    pandas reads an integer column that contains a blank cell as float64, so
    the same ID can arrive as ``123`` in one sheet and ``123.0`` in the other;
    integer-valued floats are therefore rendered without the trailing ``.0``.
    """
    if pd.isna(value):
        return None
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    text = str(value).strip()
    return text or None


def _normalize_ids(series: pd.Series) -> pd.Series:
    """Apply :func:`_normalize_id` to every cell, keeping the original index."""
    return series.map(_normalize_id)


def _missing_mask(df1: pd.DataFrame, df2: pd.DataFrame, column: str) -> pd.Series:
    """Return a boolean mask over df2 marking rows whose ID is absent from df1."""
    known = set(_normalize_ids(df1[column]).dropna())
    candidates = _normalize_ids(df2[column])
    # Blank IDs are never "missing": there is nothing to match them on.
    return candidates.notna() & ~candidates.isin(known)


def load_excel(file_path: str, sheet_name: Optional[str] = None) -> pd.DataFrame:
    """Load one sheet of an Excel file into a DataFrame.

    Args:
        file_path: Path of the workbook to read.
        sheet_name: Sheet to load. When None, the workbook must contain
            exactly one sheet, which is returned.

    Returns:
        The sheet contents.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If ``sheet_name`` is None and the workbook has several sheets.
        Exception: Any other error raised by ``pandas.read_excel`` is logged
            and re-raised unchanged.
    """
    try:
        loaded = pd.read_excel(file_path, sheet_name=sheet_name)
    except FileNotFoundError:
        logger.error("File not found: %s", file_path)
        raise
    except Exception as exc:
        logger.error("Error loading file %s: %s", file_path, exc)
        raise

    if not isinstance(loaded, dict):
        return loaded

    # sheet_name=None makes pandas return {sheet name: DataFrame} for all sheets.
    if len(loaded) == 1:
        return next(iter(loaded.values()))

    message = f"Multiple sheets found in {file_path}. Please specify a sheet name."
    logger.error("Error loading file %s: %s", file_path, message)
    raise ValueError(message)


def check_duplicates(df: pd.DataFrame, column: str) -> List[str]:
    """Return the IDs in ``column`` that repeat an earlier row.

    An ID is listed once for every repeated occurrence (the first occurrence
    is not listed). Blank cells are ignored. IDs are returned as normalised
    strings (see :func:`_normalize_id`).
    """
    ids = _normalize_ids(df[column]).dropna()
    return ids[ids.duplicated()].tolist()


def find_missing_ids(df1: pd.DataFrame, df2: pd.DataFrame, column: str) -> List[str]:
    """Return the IDs in ``df2[column]`` that do not occur in ``df1[column]``.

    IDs are compared and returned as normalised strings, in df2 row order.
    Blank cells in either frame are ignored.
    """
    candidates = _normalize_ids(df2[column])
    return candidates[_missing_mask(df1, df2, column)].tolist()


def append_missing_ids(reinoud_df: pd.DataFrame, sisa_df: pd.DataFrame, column: str, reinoud_file: str) -> pd.DataFrame:
    """Append the SISA rows whose ID is not yet present in ``reinoud_df``.

    For each such row the ID column plus Naam, Voornaam, Plan and Campus
    emailadres are copied; 'Campus emailadres' is renamed to 'mail'.

    Args:
        reinoud_df: Frame to extend.
        sisa_df: Frame providing the candidate rows.
        column: Name of the ID column, present in both frames.
        reinoud_file: File name used in the log output only.

    Returns:
        A new frame with the missing rows appended, or ``reinoud_df`` itself
        when nothing is missing.

    Raises:
        KeyError: If ``column`` is absent from either frame, or if rows must be
            appended and ``sisa_df`` lacks one of the copied columns.
    """
    missing = _missing_mask(reinoud_df, sisa_df, column)
    if not missing.any():
        logger.info("No missing IDs to append.")
        return reinoud_df

    # dict.fromkeys de-duplicates while keeping order, in case ``column`` is
    # itself one of the detail columns.
    wanted = list(dict.fromkeys((column, *_SISA_DETAIL_COLUMNS)))
    absent = [name for name in wanted if name not in sisa_df.columns]
    if absent:
        raise KeyError(f"Columns missing from the SISA data: {absent}")

    missing_rows = sisa_df.loc[missing, wanted].rename(columns=_SISA_TO_REINOUD)
    combined = pd.concat([reinoud_df, missing_rows], ignore_index=True)

    logger.info("Appended missing IDs to %s:", reinoud_file)
    for record in missing_rows.to_dict("records"):
        logger.info(
            "ID: %s, Naam: %s, Voornaam: %s, Plan: %s, mail: %s",
            record[column], record['Naam'], record['Voornaam'], record['Plan'], record['mail'],
        )
    return combined


def _save_sheet(df: pd.DataFrame, file_path: str, sheet_name: Optional[str]) -> None:
    """Write ``df`` to one sheet of an existing workbook, keeping the other sheets.

    A plain ``DataFrame.to_excel(path)`` creates a fresh workbook and thereby
    drops every other sheet; opening the writer in append mode and replacing
    only the target sheet avoids that. Cell formatting of the replaced sheet
    is not preserved.
    """
    if sheet_name is None:
        # No sheet was named when loading, so write back to the first sheet.
        with pd.ExcelFile(file_path) as workbook:
            sheet_name = workbook.sheet_names[0]
    # Append mode is only supported by the openpyxl engine.
    with pd.ExcelWriter(file_path, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)


def main(reinoud_file: str, sisa_file: str, column: str, reinoud_sheet: Optional[str] = None, sisa_sheet: Optional[str] = None) -> None:
    """Report duplicate IDs in the reinoud sheet and append the IDs it lacks.

    Args:
        reinoud_file: Workbook that is read and then updated in place.
        sisa_file: Workbook providing the reference list of IDs.
        column: Name of the ID column shared by both sheets.
        reinoud_sheet: Sheet of ``reinoud_file`` to read and write back.
        sisa_sheet: Sheet of ``sisa_file`` to read.
    """
    # Load the Excel files
    reinoud_df = load_excel(reinoud_file, sheet_name=reinoud_sheet)
    sisa_df = load_excel(sisa_file, sheet_name=sisa_sheet)

    # Debug: show which columns the SISA sheet provides
    logger.info("Columns in %s: %s", sisa_file, sisa_df.columns.tolist())

    # Check for duplicates in reinoud
    duplicates = check_duplicates(reinoud_df, column)
    if duplicates:
        logger.info("Duplicate IDs in %s:", reinoud_file)
        logger.info("%s", duplicates)
    else:
        logger.info("No duplicates found in %s.", reinoud_file)

    # Append missing IDs from sisa to reinoud
    reinoud_df = append_missing_ids(reinoud_df, sisa_df, column, reinoud_file)

    # Save the updated sheet back without touching the workbook's other sheets
    _save_sheet(reinoud_df, reinoud_file, reinoud_sheet)
    logger.info("Updated %s saved.", reinoud_file)


if __name__ == "__main__":
    # Example usage
    # change the file names, column name, and sheet names as needed
    main('reinoud.xlsx', 'sisa.xlsx', 'Rolnummer', reinoud_sheet='Actief', sisa_sheet='sheet1')