Skip to content

Dataset Resource Values

CceeOpenDataDatasetsResourcesValues(ccee)

Class used for handling CCEE values from dataset resources. Can be accessed via ccee.opendata.datasets.resources.values.

Source code in echo_ons/ccee_root.py
def __init__(self, ccee: e_o.Ccee) -> None:
    """Validate and keep a reference to the top-level Ccee object.

    Parameters
    ----------
    ccee : Ccee
        Top level object carrying all functionality and the connection handler.

    Raises
    ------
    ValueError
        If ``ccee`` is not an instance of Ccee.
    """
    # only a genuine Ccee instance is stored; anything else is rejected up front
    is_ccee_instance = isinstance(ccee, e_o.Ccee)
    if is_ccee_instance:
        self._ccee: e_o.Ccee = ccee
        return

    raise ValueError(f"ccee must be of type Ccee, not {type(ccee)}")

get(dataset_name, resource_names=None, filters=None, output_type='DataFrame')

Gets the values from a dataset resource.

This uses the CKAN API to get the values from a dataset resource. More documentation is available at https://docs.ckan.org/en/2.11/maintaining/datastore.html#ckanext.datastore.logic.action.datastore_search

Keep in mind that the CCEE Open Data portal launched around June 2024, so data from before then is not available.

Parameters:

  • dataset_name

    (str) –

    Name of the dataset to get the resources from.

  • resource_names

    (list[str], default: None ) –

    List of resource names to get the values from. If None, gets all resources, by default None

  • filters

    (dict[str, Any], default: None ) –

    Filters to apply to the data. It is a dictionary with the keys being the field names and the values being the values to filter by. The values can be lists to filter by multiple values.

    If not set, no filtering is done. By default None

  • output_type

    (Literal['dict', 'DataFrame'], default: 'DataFrame' ) –

    Type of the output, by default "DataFrame"

Returns:

  • list[dict[str, Any]]

    If output_type is "dict", returns a list of dictionaries with the results.

  • DataFrame

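    If output_type is "DataFrame", returns a DataFrame built from the combined records of all requested resources. Note that the source code shown below creates it with the default integer index rather than indexing it by resource name.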

Source code in echo_ons/ccee_opendata_datasets_resources_values.py
def get(
    self,
    dataset_name: str,
    resource_names: list[str] | None = None,
    filters: dict[str, Any] | None = None,
    output_type: Literal["dict", "DataFrame"] = "DataFrame",
) -> list[dict[str, Any]] | DataFrame:
    """Gets the values from a dataset resource.

    This uses ckan API to get the values from a dataset resource. More documentation at https://docs.ckan.org/en/2.11/maintaining/datastore.html#ckanext.datastore.logic.action.datastore_search

    Keep in mind that CCEE Open Data portal launched at around June of 2024, so data before that is not available.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset to get the resources from.
    resource_names : list[str], optional
        List of resource names to get the values from. If None, gets all resources, by default None
    filters : dict[str, Any], optional
        Filters to apply to the data. It is a dictionary with the keys being the field names and the values being the values to filter by. The values can be lists to filter by multiple values.

        If not set, no filtering is done. By default None
    output_type : Literal["dict", "DataFrame"], optional
        Type of the output, by default "DataFrame"

    Returns
    -------
    list[dict[str, Any]]
        If output_type is "dict", returns a list of dictionaries with the results.
    DataFrame
        If output_type is "DataFrame", returns a DataFrame with the results. The index is the resource names.
    """
    if not isinstance(dataset_name, str):
        raise ValueError(f"dataset_name must be of type str, not {type(dataset_name)}")
    if not isinstance(resource_names, list | type(None)):
        raise ValueError(f"resource_names must be of type list, not {type(resource_names)}")
    if output_type not in ["dict", "DataFrame"]:
        raise ValueError(f"output_type must be 'dict' or 'DataFrame', not {output_type}")

    # searching for the dataset
    dataset = self._ccee.opendata.datasets.search("name", dataset_name, output_type="dict")
    if dataset_name not in dataset:
        raise ValueError(f"Dataset {dataset_name} not found")

    # getting the resources
    resources = self._ccee.opendata.datasets.resources.get(dataset_name, output_type="dict")
    if resource_names:
        wrong_names = set(resource_names) - set(resources.keys())
        if wrong_names:
            raise ValueError(f"Resource(s) {wrong_names} not found")
    else:
        resource_names = list(resources.keys())

    endpoint = "api/3/action/datastore_search"

    results = []
    for resource_name in resource_names:
        resource_id = resources[resource_name]["id"]

        finished = False
        offset = 0
        this_results = []
        n_per_request = 10000
        while not finished:
            args = {
                "resource_id": resource_id,
                "limit": n_per_request,
                "offset": offset,
                "filters": filters,
            }
            response = self._ccee._opendata_conn.get(endpoint, json=args)  # noqa: SLF001
            if not response.ok:
                raise ValueError(f"Failed to get data from CCEE open data. {response.text}")
            records = response.json()["result"]["records"]
            this_results.extend(records)
            if len(records) < n_per_request:
                finished = True
            else:
                offset += n_per_request

        results.extend(this_results)

    if output_type == "DataFrame":
        results = DataFrame(results)

    return results