Skip to content

Datasets

CceeOpenDataDatasets(ccee)

Class used for handling CCEE datasets. Can be accessed via ccee.opendata.datasets.

Parameters:

  • ccee

    (Ccee) –

    Top level object carrying all functionality.

Source code in echo_ons/ccee_opendata_datasets.py
def __init__(self, ccee: e_o.Ccee) -> None:
    """Handler for CCEE datasets, reachable through `ccee.opendata.datasets`.

    Parameters
    ----------
    ccee : Ccee
        Top level object carrying all functionality.
    """
    super().__init__(ccee)

    # * subclasses

    # The resources handler is built from the same top level object and exposed as `self.resources`.
    resources_handler = CceeOpenDataDatasetsResources(ccee)
    self.resources = resources_handler

list()

List all datasets available.

Returns:

  • list[str]

    List of all datasets available.

Source code in echo_ons/ccee_opendata_datasets.py
def list(
    self,
) -> list[str]:
    """Fetch every dataset listed by the CCEE open data portal.

    Issues a GET request to the `api/3/action/package_list` endpoint through the
    open data connection held by the top level object.

    Returns
    -------
    list[str]
        List of all datasets available (the "result" entry of the JSON response).

    Raises
    ------
    ValueError
        If the response is not OK. The response text is included in the message.
    """
    reply = self._ccee._opendata_conn.get("api/3/action/package_list")  # noqa: SLF001

    if reply.ok:
        return reply.json()["result"]

    raise ValueError(f"Failed to get dataset list from CCEE open data. {reply.text}")

search(field_name, field_value, output_type='dict')

Search for a dataset by a specific field.

This uses ckan API to search for datasets by a specific field. More documentation at https://docs.ckan.org/en/2.11/api/index.html?highlight=package_search#ckan.logic.action.get.package_search

The most useful key is "resources", which contains a list of dictionaries whose most important keys are "name" and "id"; the latter is used to query the data through other APIs.

Parameters:

  • field_name

    (str) –

    Name of the field to search by, such as "title", "name", etc.

  • field_value

    (str) –

    Value of the field to search by.

  • output_type

    (Literal['dict', 'DataFrame'], default: 'dict' ) –

    Type of the output, by default "dict"

Returns:

  • dict[str, dict[str, Any]]

    If output_type is "dict", returns a dictionary with the results. The keys are the dataset names.

  • DataFrame

    If output_type is "DataFrame", returns a DataFrame with the results. The index is the dataset names.

Source code in echo_ons/ccee_opendata_datasets.py
def search(
    self,
    field_name: str,
    field_value: str,
    output_type: Literal["dict", "DataFrame"] = "dict",
) -> dict[str, dict[str, Any]] | DataFrame:
    """Search for a dataset by a specific field.

    This uses ckan API to search for datasets by a specific field. More documentation at https://docs.ckan.org/en/2.11/api/index.html?highlight=package_search#ckan.logic.action.get.package_search

    The most useful key is the "resources", which contais a list of dictionaries with the most important keys being "name" and "id", where the last one will be used to query the data by other APIs.

    Parameters
    ----------
    field_name : str
        Name of the field to search by, such as "title", "name", etc.
    field_value : str
        Value of the field to search by.
    output_type : Literal["dict", "DataFrame"], optional
        Type of the output, by default "dict"

    Returns
    -------
    dict[str, dict[str, Any]]
        If output_type is "dict", returns a dictionary with the results. The keys are the dataset names.
    DataFrame
        If output_type is "DataFrame", returns a DataFrame with the results. The index is the dataset names.
    """
    if not isinstance(field_name, str):
        raise ValueError(f"field_name must be of type str, not {type(field_name)}")
    if not isinstance(field_value, str):
        raise ValueError(f"field_value must be of type str, not {type(field_value)}")
    if output_type not in ["dict", "DataFrame"]:
        raise ValueError(f"output_type must be 'dict' or 'DataFrame', not {output_type}")

    endpoint = "api/3/action/package_search"

    payload = {
        "q": f"{field_name}:{field_value}",
        "rows": 1000,
        "start": 0,
    }

    got_all = False
    page_num = 0
    results = []
    while not got_all:
        payload["start"] += page_num * 1000
        response = self._ccee._opendata_conn.get(endpoint, json=payload)  # noqa: SLF001

        if not response.ok:
            raise ValueError(f"Failed to get data from CCEE open data. {response.text}")

        response = response.json()

        if len(response["result"]["results"]) < 1000:
            got_all = True

        results.extend(response["result"]["results"])

        page_num += 1

    # converting to output_type
    if output_type == "dict":
        return {result["name"]: result for result in results}

    df = DataFrame.from_dict(results, orient="columns")
    df = df.set_index("name")
    return df