# utils.py - from the readytensor/rt-datasets-forecasting repository (main branch)

import pandas as pd
import numpy as np
import json
import os
from typing import Dict, Any, Type

def strip_quotes(val: str) -> str:
    """
    Removes one pair of matching surrounding quotes from a string.

    A value is only changed when it is a string of at least two characters
    whose first and last characters are the same quote character (either
    ' or "). Anything else, including non-string values, is returned as is.

    Args:
    val (str): The value to strip quotes from.

    Returns:
    str: The value without its surrounding quotes, or the unchanged input.
    """
    # Non-strings and strings too short to hold a quote pair pass through.
    if not isinstance(val, str) or len(val) < 2:
        return val

    opening, closing = val[0], val[-1]
    is_quoted = opening in ("'", '"') and opening == closing
    return val[1:-1] if is_quoted else val

def load_metadata(dataset_cfg_path: str) -> pd.DataFrame:
    """
    Loads the dataset metadata.

    The file is read as CSV and every cell is passed through strip_quotes,
    so string cells wrapped in a matching pair of quotes lose that pair.
    Non-string cells are left untouched by strip_quotes.

    Args:
    dataset_cfg_path (str): The path to the dataset configuration file.

    Returns:
    pd.DataFrame: The dataset metadata with surrounding quotes removed
    from its string cells.
    """
    # Load the dataset metadata
    dataset_metadata = pd.read_csv(dataset_cfg_path)
    # DataFrame.apply hands each *column* (a Series) to the callable, so
    # passing strip_quotes directly never reaches the individual cells and
    # leaves the frame unchanged. Map over the cells of every column instead.
    dataset_metadata = dataset_metadata.apply(
        lambda column: column.map(strip_quotes)
    )
    return dataset_metadata

def load_features_config(features_cfg_path: str) -> pd.DataFrame:
    """
    Reads the data features configuration from a CSV file.

    The file is decoded as latin-1.

    Args:
    features_cfg_path (str): The path to the features configuration file.

    Returns:
    pd.DataFrame: The data features configuration.
    """
    csv_encoding = 'latin-1'
    return pd.read_csv(features_cfg_path, encoding=csv_encoding)

def load_dataset(dataset_name: str, processed_datasets_path: str):
    """
    Reads the CSV file of a dataset.

    The file is expected at
    <processed_datasets_path>/<dataset_name>/<dataset_name>.csv.

    Args:
    dataset_name (str): Name of the dataset.
    processed_datasets_path (str): Directory containing one sub-directory per dataset.

    Returns:
    pd.DataFrame: The contents of the dataset CSV file.
    """
    csv_file_name = f"{dataset_name}.csv"
    dataset_dir = os.path.join(processed_datasets_path, dataset_name)
    return pd.read_csv(os.path.join(dataset_dir, csv_file_name))

def load_schema(dataset_name: str, processed_datasets_path: str) -> Dict[str, Any]:
    """
    Reads and returns the JSON schema of a dataset.

    The file is expected at
    <processed_datasets_path>/<dataset_name>/<dataset_name>_schema.json
    and is read as UTF-8.

    Args:
    dataset_name (str): Name of the dataset.
    processed_datasets_path (str): Directory containing one sub-directory per dataset.

    Returns:
    Dict: The parsed content of the schema file.
    """
    schema_file_name = f"{dataset_name}_schema.json"
    schema_path = os.path.join(processed_datasets_path, dataset_name, schema_file_name)
    with open(schema_path, 'r', encoding="utf-8") as schema_file:
        return json.load(schema_file)

class JSONEncoder(json.JSONEncoder):
    """
    JSON encoder that can also serialize Numpy scalars and arrays.

    Numpy integers, floats and booleans are converted to the matching
    built-in type and arrays are converted to (nested) lists. Any other
    object is passed on to the default handling of json.JSONEncoder.
    """
    def default(self, obj):
        # Ordered (numpy kind, built-in constructor) pairs; first match wins.
        scalar_conversions = (
            (np.integer, int),
            (np.floating, float),
            (np.bool_, bool),
        )
        for numpy_kind, to_builtin in scalar_conversions:
            if isinstance(obj, numpy_kind):
                return to_builtin(obj)

        if isinstance(obj, np.ndarray):
            return obj.tolist()

        # Not a Numpy object: let the base class handle (or reject) it.
        return super().default(obj)


def convert_numpy_types(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert numpy/pandas scalar values to native Python data types.

    The dictionary is modified in place and the same object is returned.
    Only the top-level values are inspected; containers such as lists,
    arrays or nested dictionaries are left untouched.

    Conversions, checked in this order:
        - numpy integer  -> int
        - numpy floating -> float (a numpy NaN therefore stays a float NaN)
        - numpy boolean  -> bool
        - null scalar (None, float NaN, pd.NaT, ...) -> None
        - pd.Timestamp   -> str

    Args:
        data (dict): The dictionary to convert.

    Returns:
        dict: The same dictionary object, with its values converted.
    """
    is_scalar = pd.api.types.is_scalar

    # Only existing keys are reassigned, so the dictionary size never changes
    # while it is being iterated.
    for key, value in data.items():
        if isinstance(value, np.integer):
            data[key] = int(value)
        elif isinstance(value, np.floating):
            data[key] = float(value)
        elif isinstance(value, np.bool_):
            data[key] = bool(value)
        elif is_scalar(value) and pd.isnull(value):
            # pd.isnull returns an element-wise array for list-like values,
            # whose truth value is ambiguous and raises ValueError; the null
            # check is therefore restricted to scalars.
            data[key] = None
        elif isinstance(value, pd.Timestamp):
            data[key] = str(value)

    return data