"""Define a class wrapper for Stats Can functions."""
from pathlib import Path
from stats_can import sc
from stats_can import scwds
class StatsCan:
    """Load Statistics Canada data and metadata into python.

    Thin object-oriented wrapper around the ``stats_can.sc`` and
    ``stats_can.scwds`` module functions. Methods that work with locally
    stored tables pass ``self.data_folder`` and the ``stats_can.h5`` file
    name through to ``sc``; the static methods forward straight to the
    underlying function and need no instance.

    Parameters
    ----------
    data_folder: Path/str, default None
        The location to save/search for locally stored Statistics Canada
        data tables. Defaults to the current working directory
    """

    # File name, relative to data_folder, handed to every sc helper that
    # takes an ``h5file`` argument. Kept in one place so it cannot drift
    # between methods.
    _H5_FILE = "stats_can.h5"

    def __init__(self, data_folder=None):
        if data_folder is None:
            self.data_folder = Path.cwd()
        else:
            # Get a path object even if passed str
            self.data_folder = Path(data_folder)

    @property
    def downloaded_tables(self):
        """Check which tables you've downloaded.

        Checks the file "stats_can.h5" in the instantiated data folder
        and lists all tables stored there.

        Returns
        -------
        [table_ids]
            The "productId" of each entry returned by ``sc.list_h5_tables``,
            or an empty list when the file does not exist.
        """
        stat_h5 = self.data_folder / self._H5_FILE
        # Guard clause: without the file there is nothing to list, and the
        # sc helper is never called.
        if not stat_h5.exists():
            return []
        full_meta = sc.list_h5_tables(path=self.data_folder, h5file=self._H5_FILE)
        return [item["productId"] for item in full_meta]

    def table_to_df(self, table):
        """Read a table to a dataframe.

        Parameters
        ----------
        table: str
            The ID of the table of interest, e.g "271-000-22"

        Returns
        -------
        pandas.DataFrame
            Dataframe of the requested table

        Notes
        -----
        If the table has been previously loaded to the file in
        self.data_folder it will retrieve that locally stored dataframe.
        If it's unavailable it will download it and then return the table.
        To update a locally stored table, call StatsCan.update_tables(),
        optionally passing just the table number of interest
        """
        return sc.table_to_df(
            table=table, path=self.data_folder, h5file=self._H5_FILE
        )

    @staticmethod
    def vectors_to_df_remote(
        vectors, periods=1, start_release_date=None, end_release_date=None
    ):
        """Retrieve V# data directly from Statistics Canada.

        Parameters
        ----------
        vectors: str or [str]
            V#(s) to retrieve data for
        periods: int, default 1
            Number of periods to retrieve data.
            Note that this will be ignored if start_release_date and
            end_release_date are set
        start_release_date: datetime.date, default None
            earliest release date to retrieve data
        end_release_date: datetime.date, default None
            latest release date to retrieve data

        Returns
        -------
        pandas.DataFrame
            Dataframe indexed on reference (not release) date, with columns
            for each V# input

        Notes
        -----
        Start and end release date refer to the dates the data was
        released, not the reference period they cover. For example,
        October labour force survey data is released on the first or
        second Friday of November.
        """
        # Arguments are forwarded positionally, in the same order as this
        # method's own signature.
        return sc.vectors_to_df(vectors, periods, start_release_date, end_release_date)

    def vectors_to_df(self, vectors, start_date=None):
        """Get a dataframe of V#s.

        Parameters
        ----------
        vectors: str or [str]
            the V#s to retrieve
        start_date: datetime.date, optional
            earliest reference period to return, defaults to all available
            history

        Returns
        -------
        pandas.DataFrame
            Dataframe indexed on reference date, with columns for each V#
            input

        Notes
        -----
        Any V#s in tables that are not currently locally stored will have
        their tables downloaded prior to returning the dataframe
        """
        return sc.vectors_to_df_local(
            vectors=vectors,
            path=self.data_folder,
            start_date=start_date,
            h5file=self._H5_FILE,
        )

    def update_tables(self, tables=None):
        """Update locally stored tables.

        Compares latest available reference period in locally stored tables
        to the latest available on Statistics Canada and updates any tables
        necessary

        Parameters
        ----------
        tables: str or [str], default None
            Optional subset of tables to check for updates, defaults to
            update all downloaded tables

        Returns
        -------
        [str]
            list of tables that were updated, empty list if no updates made
        """
        return sc.update_tables(
            path=self.data_folder, h5file=self._H5_FILE, tables=tables, csv=True
        )

    def delete_tables(self, tables):
        """Remove locally stored tables.

        Parameters
        ----------
        tables: str or [str]
            tables to delete

        Returns
        -------
        [deleted tables]
        """
        return sc.delete_tables(
            tables=tables, path=self.data_folder, h5file=self._H5_FILE, csv=True
        )

    @staticmethod
    def get_code_sets():
        """Get code sets.

        Code sets provide additional metadata to describe variables and are
        grouped into scales, frequencies, symbols etc.

        Returns
        -------
        code_sets: [dict]
            one dictionary for each group of information
        """
        return scwds.get_code_sets()

    @staticmethod
    def vectors_updated_today():
        """Get a list of all V#s that were updated today.

        Returns
        -------
        changed_series: [dict]
            one dictionary for each vector with its update date
        """
        return scwds.get_changed_series_list()

    @staticmethod
    def tables_updated_today():
        """Get a list of tables that were updated today.

        Returns
        -------
        changed_tables: [dict]
            one dictionary for each table with its update date
        """
        return scwds.get_changed_cube_list()

    @staticmethod
    def tables_updated_on_date(date):
        """Get a list of tables that were updated on a given date.

        Parameters
        ----------
        date: str or datetime.date
            The date to check tables

        Returns
        -------
        changed_tables: [dict]
            one dictionary for each table with its update date
        """
        return scwds.get_changed_cube_list(date)

    @staticmethod
    def get_tables_for_vectors(vectors):
        """Find which table(s) a V# or list of V#s is from.

        Parameters
        ----------
        vectors: str or [str]
            V#(s) to look up tables for

        Returns
        -------
        dict
            dictionary of vector:table pairs plus an "all_tables" key with
            a list of all tables containing the input V#s

        Examples
        --------
        These examples query the remote service, so they are skipped when
        run under doctest.

        >>> StatsCan.get_tables_for_vectors("v39050")  # doctest: +SKIP
        {39050: '10100139', 'all_tables': ['10100139']}
        >>> StatsCan.get_tables_for_vectors(["v39050", "v1074250274"])  # doctest: +SKIP
        {39050: '10100139', 1074250274: '16100011', 'all_tables': ['10100139', '16100011']}
        """
        return sc.get_tables_for_vectors(vectors)