Skip to content

load_dads

load_dads(project: str, year: int, columns: List[str], filters: Optional[List[Tuple[str, str, str]]] = None)

Loads the DADS data for a given year.

Parameters:

Name Type Description Default
project str

The name of the CASD project.

required
year int

The year of the data to load.

required
columns List[str]

The columns to load.

required
filters Optional[List[Tuple[str, str, str]]]

The filters to apply. Each filter is a tuple of (column, operator, value). Defaults to None.

None

Returns:

Type Description
DataFrame

The loaded data.

Raises:

Type Description
ValueError

If the data is not available for the given year.

Source code in bozio_wasmer_simulations/datasets/loaders.py
def load_dads(
    project: str, year: int, columns: List[str], filters: Optional[List[Tuple[str, str, str]]] = None
) -> pd.DataFrame:
    """
    Loads the DADS data for a given year.

    The storage layout depends on the year:

    - before 2018: several SAS tables (``post<code>.sas7bdat``) stored in a
      ``Régions`` sub-folder, concatenated row-wise;
    - 2018 and 2019: four SAS tables (``post_1`` to ``post_4``), concatenated
      row-wise;
    - 2020 and 2021: a single ``Format parquet`` location loaded in one call;
    - 2022: every ``.parquet`` file of the year folder whose name contains the
      year, concatenated row-wise.

    Column names are upper-cased for the SAS years and lower-cased for the
    parquet years before being passed to the loader.

    Args:
        project (str): The name of the CASD project.
        year (int): The year of the data to load.
        columns (List[str]): The columns to load.
        filters (Optional[List[Tuple[str, str, str]]], optional): The filters to apply. Each filter is a tuple of (column, operator, value). Defaults to None.

    Returns:
        (pd.DataFrame): The loaded data.

    Raises:
        ValueError: If the data is not available for the given year, or if no
            matching parquet file is found for 2022.
    """
    # Fail fast on unsupported years, before instantiating the loader.
    if year > 2022:
        raise ValueError(f"Data not available for year : {year}")

    loader = Loader()
    # Year folder on the CASD file share (UNC path, hence the leading "\\\\").
    base_path = f"\\\\casd.fr\\casdfs\\Projets\\{project}\\Data\\DADS_DADS Postes_{year}"

    if year < 2020:
        # SAS tables expose upper-case column names.
        columns = [e.upper() for e in columns]
        if year < 2018:
            table_dir = f"{base_path}\\Régions"
            # One table per code (presumably region codes, given the folder name).
            table_names = [
                f"post{i}.sas7bdat"
                for i in [24, 27, 28, 32, 44, 52, 53, 75, 76, 84, 93, 94, 97, 99]
            ]
        else:
            table_dir = base_path
            table_names = [f"post_{i}.sas7bdat" for i in range(1, 5)]

        # The directory must come first: joining onto an absolute path
        # discards every component that precedes it.
        list_data_dads = [
            loader.load(
                path=os.path.join(table_dir, table_name),
                columns=columns,
                filters=filters,
            )
            for table_name in tqdm(table_names)
        ]
        return pd.concat(list_data_dads, axis=0, ignore_index=True)

    # Parquet years expose lower-case column names.
    columns = [e.lower() for e in columns]

    if year < 2022:
        return loader.load(
            path=f"{base_path}\\Format parquet", columns=columns, filters=filters
        )

    # year == 2022: keep only the parquet files that relate to that year.
    parquet_files = [
        file
        for file in os.listdir(base_path)
        if file.endswith(".parquet") and str(year) in file
    ]
    if not parquet_files:
        raise ValueError(f"No parquet file found for year {year} in {base_path}")

    return pd.concat(
        (
            loader.load(path=f"{base_path}\\{file}", columns=columns, filters=filters)
            for file in parquet_files
        ),
        axis=0,
        join="outer",
        ignore_index=True,
    )