| import dspy | |
| import random | |
| from dspy.datasets import Dataset | |
| from datasets import load_dataset | |
| from typing import Union, List, Mapping, Tuple | |
class DataLoader(Dataset):
    """Build lists of ``dspy.Example`` objects from external data sources.

    Provides loaders for Hugging Face datasets and CSV files, plus helpers
    for sampling from and splitting a list of examples.
    """

    def __init__(self):
        # Intentionally a no-op: no instance attributes are set up here.
        pass

    @staticmethod
    def _to_examples(rows, fields, input_keys):
        """Convert an iterable of mapping-like rows into ``dspy.Example`` objects.

        Args:
            rows: Iterable of rows supporting ``row[field]`` and ``row.keys()``.
            fields: Field names to copy from each row; when empty or ``None``
                every key of the row is copied.
            input_keys: Names of the fields to mark as inputs. A bare string
                is treated as a single key.

        Returns:
            A list with one ``dspy.Example`` per row.
        """
        # `with_inputs` is variadic, so the keys must be unpacked; passing the
        # tuple itself would register the whole tuple as one (never matching)
        # key. A lone string is wrapped so it is not split into characters.
        if isinstance(input_keys, str):
            keys = (input_keys,)
        else:
            keys = tuple(input_keys)

        return [
            dspy.Example(
                {field: row[field] for field in (fields or row.keys())}
            ).with_inputs(*keys)
            for row in rows
        ]

    @staticmethod
    def _resolve_split_size(size, total, name):
        """Turn a fractional or absolute split size into an item count.

        Args:
            size: A float strictly between 0 and 1 (fraction of ``total``)
                or an int (absolute count).
            total: Number of items available.
            name: Parameter name used in the error message.

        Returns:
            The number of items as an int.

        Raises:
            ValueError: If ``size`` is neither a float in (0, 1) nor an int.
        """
        if isinstance(size, float) and 0 < size < 1:
            return int(total * size)
        if isinstance(size, int):
            return size
        raise ValueError(f"Invalid {name}. Please provide a float between 0 and 1 or an int.")

    def from_huggingface(
        self,
        dataset_name: str,
        *args,
        input_keys: Tuple[str, ...] = (),
        fields: Union[Tuple[str, ...], None] = None,
        **kwargs
    ) -> Union[Mapping[str, List[dspy.Example]], List[dspy.Example]]:
        """Load a dataset through ``datasets.load_dataset`` as ``dspy.Example`` objects.

        Args:
            dataset_name: Name or path forwarded to ``load_dataset``.
            *args: Extra positional arguments forwarded to ``load_dataset``.
            input_keys: Tuple of field names to mark as inputs on each example.
            fields: Tuple of field names to keep; all fields are kept when
                omitted.
            **kwargs: Extra keyword arguments forwarded to ``load_dataset``.

        Returns:
            A dict mapping split name to a list of examples when the loaded
            object exposes ``keys()`` (or several splits were requested via
            ``split=[...]``); otherwise a flat list of examples.

        Raises:
            ValueError: If ``fields`` or ``input_keys`` is not a tuple, or if
                several splits were loaded but their names are unknown.
        """
        if fields and not isinstance(fields, tuple):
            raise ValueError("Invalid fields provided. Please provide a tuple of fields.")

        if not isinstance(input_keys, tuple):
            raise ValueError("Invalid input keys provided. Please provide a tuple of input keys.")

        dataset = load_dataset(dataset_name, *args, **kwargs)

        if isinstance(dataset, list):
            # A list result means several splits were requested; pair each
            # loaded split with the name it was requested under.
            split_names = kwargs.get("split")
            if not isinstance(split_names, (list, tuple)):
                raise ValueError(
                    "Multiple splits were loaded but their names could not be determined. "
                    "Please pass `split` as a keyword argument."
                )
            dataset = dict(zip(split_names, dataset))

        # Explicit check instead of catching AttributeError around the whole
        # conversion, which would hide unrelated errors raised while
        # building the examples.
        if hasattr(dataset, "keys"):
            return {
                split_name: self._to_examples(dataset[split_name], fields, input_keys)
                for split_name in dataset.keys()
            }

        return self._to_examples(dataset, fields, input_keys)

    def from_csv(self, file_path: str, fields: List[str] = None, input_keys: Tuple[str] = ()) -> List[dspy.Example]:
        """Load a CSV file as a list of ``dspy.Example`` objects.

        Args:
            file_path: Path passed to ``load_dataset`` as ``data_files``.
            fields: Column names to keep; defaults to every feature of the
                loaded dataset.
            input_keys: Field names to mark as inputs on each example.

        Returns:
            One example per row of the ``"train"`` split produced by the
            CSV loader.
        """
        dataset = load_dataset("csv", data_files=file_path)["train"]

        if not fields:
            fields = list(dataset.features)

        return self._to_examples(dataset, fields, input_keys)

    def sample(
        self,
        dataset: List[dspy.Example],
        n: int,
        *args,
        **kwargs
    ) -> List[dspy.Example]:
        """Return ``n`` examples drawn from ``dataset`` via ``random.sample``.

        Args:
            dataset: List of examples to draw from.
            n: Number of examples to draw.
            *args: Extra positional arguments forwarded to ``random.sample``.
            **kwargs: Extra keyword arguments forwarded to ``random.sample``.

        Raises:
            ValueError: If ``dataset`` is not a list.
        """
        if not isinstance(dataset, list):
            raise ValueError(f"Invalid dataset provided of type {type(dataset)}. Please provide a list of examples.")

        return random.sample(dataset, n, *args, **kwargs)

    def train_test_split(
        self,
        dataset: List[dspy.Example],
        train_size: Union[int, float] = 0.75,
        test_size: Union[int, float] = None,
        random_state: int = None
    ) -> Mapping[str, List[dspy.Example]]:
        """Shuffle ``dataset`` and split it into train and test portions.

        Args:
            dataset: Examples to split; the input itself is not modified.
            train_size: Fraction in (0, 1) or absolute number of train items.
            test_size: Fraction in (0, 1) or absolute number of test items;
                when omitted, everything after the train portion is used.
            random_state: Seed for a reproducible shuffle. When ``None`` the
                module-level ``random`` generator is used.

        Returns:
            A dict with the keys ``'train'`` and ``'test'``.

        Raises:
            ValueError: If a size is invalid or the two sizes together
                exceed the number of samples.
        """
        # A private generator yields the same shuffle as seeding the global
        # one, without reseeding `random` for the rest of the process.
        rng = random if random_state is None else random.Random(random_state)

        dataset_shuffled = list(dataset)
        rng.shuffle(dataset_shuffled)
        total = len(dataset_shuffled)

        train_end = self._resolve_split_size(train_size, total, "train_size")

        if test_size is not None:
            test_end = self._resolve_split_size(test_size, total, "test_size")
            if train_end + test_end > total:
                raise ValueError("train_size + test_size cannot exceed the total number of samples.")
        else:
            test_end = total - train_end

        train_dataset = dataset_shuffled[:train_end]
        test_dataset = dataset_shuffled[train_end:train_end + test_end]

        return {'train': train_dataset, 'test': test_dataset}