import os

import lmdb
import numpy as np
from torch.utils.data import Dataset

from openrec.preprocess import create_operators, transform


class TextLMDBDataSet(Dataset):

    def __init__(self, config, mode, logger, seed=None, epoch=1, task='rec'):
        super(TextLMDBDataSet, self).__init__()
        global_config = config['Global']
        dataset_config = config[mode]['dataset']
        loader_config = config[mode]['loader']
        # Touch the key so a missing batch_size_per_card fails fast; the value
        # itself is not used here.
        loader_config['batch_size_per_card']
        data_dir = dataset_config['data_dir']
        self.do_shuffle = loader_config['shuffle']
        self.lmdb_sets = self.load_hierarchical_lmdb_dataset(data_dir)
        logger.info(f'Initialize indexes of datasets: {data_dir}')
        self.data_idx_order_list = self.dataset_traversal()
        if self.do_shuffle:
            np.random.shuffle(self.data_idx_order_list)
        self.ops = create_operators(dataset_config['transforms'],
                                    global_config)
        self.ext_op_transform_idx = dataset_config.get('ext_op_transform_idx',
                                                       1)
        ratio_list = dataset_config.get('ratio_list', [1.0])
        # A sampling ratio below 1.0 means the index must be rebuilt each epoch.
        self.need_reset = True in [x < 1 for x in ratio_list]

    def load_hierarchical_lmdb_dataset(self, data_dir):
        """Open every leaf directory under data_dir as a read-only LMDB set."""
        lmdb_sets = {}
        dataset_idx = 0
        for dirpath, dirnames, filenames in os.walk(data_dir + '/'):
            if not dirnames:
                env = lmdb.open(
                    dirpath,
                    max_readers=32,
                    readonly=True,
                    lock=False,
                    readahead=False,
                    meminit=False,
                )
                txn = env.begin(write=False)
                num_samples = int(txn.get('num-samples'.encode()))
                lmdb_sets[dataset_idx] = {
                    'dirpath': dirpath,
                    'env': env,
                    'txn': txn,
                    'num_samples': num_samples,
                }
                dataset_idx += 1
        return lmdb_sets

    def dataset_traversal(self):
        """Build a (total_sample_num, 2) table of (lmdb_idx, sample_idx) pairs."""
        lmdb_num = len(self.lmdb_sets)
        total_sample_num = 0
        for lno in range(lmdb_num):
            total_sample_num += self.lmdb_sets[lno]['num_samples']
        data_idx_order_list = np.zeros((total_sample_num, 2))
        beg_idx = 0
        for lno in range(lmdb_num):
            tmp_sample_num = self.lmdb_sets[lno]['num_samples']
            end_idx = beg_idx + tmp_sample_num
            data_idx_order_list[beg_idx:end_idx, 0] = lno
            data_idx_order_list[beg_idx:end_idx,
                                1] = list(range(tmp_sample_num))
            # LMDB label keys are 1-based ('label-000000001'), so shift by one.
            data_idx_order_list[beg_idx:end_idx, 1] += 1
            beg_idx = beg_idx + tmp_sample_num
        return data_idx_order_list

    def get_ext_data(self):
        ext_data_num = 0
        for op in self.ops:
            if hasattr(op, 'ext_data_num'):
                ext_data_num = getattr(op, 'ext_data_num')
                break
        load_data_ops = self.ops[:self.ext_op_transform_idx]
        ext_data = []
        while len(ext_data) < ext_data_num:
            lmdb_idx, file_idx = self.data_idx_order_list[np.random.randint(
                len(self))]
            lmdb_idx = int(lmdb_idx)
            file_idx = int(file_idx)
            sample_info = self.get_lmdb_sample_info(
                self.lmdb_sets[lmdb_idx]['txn'], file_idx)
            if sample_info is None:
                continue
            label = sample_info
            data = {'label': label}
            data = transform(data, load_data_ops)
            if data is None:
                continue
            ext_data.append(data)
        return ext_data

    def get_lmdb_sample_info(self,
                             txn,
                             index,
                             normalize_unicode=True,
                             remove_whitespace=True,
                             max_length=True):
        # The keyword arguments above are accepted but currently unused;
        # only the raw label string is fetched and decoded.
        label_key = 'label-%09d'.encode() % index
        label = txn.get(label_key)
        if label is None:
            return None
        label = label.decode('utf-8')
        return label

    def __getitem__(self, idx):
        lmdb_idx, file_idx = self.data_idx_order_list[idx]
        lmdb_idx = int(lmdb_idx)
        file_idx = int(file_idx)
        sample_info = self.get_lmdb_sample_info(
            self.lmdb_sets[lmdb_idx]['txn'], file_idx)
        if sample_info is None:
            # Missing label: fall back to a random sample.
            return self.__getitem__(np.random.randint(self.__len__()))
        label = sample_info
        data = {'label': label}
        outs = transform(data, self.ops)
        if outs is None:
            # Transform pipeline rejected this sample: retry with a random one.
            return self.__getitem__(np.random.randint(self.__len__()))
        return outs

    def __len__(self):
        return self.data_idx_order_list.shape[0]
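

# --- Usage sketch -----------------------------------------------------------
# A minimal, hedged example of how this dataset might be wired up; it is not
# part of the class above. The config layout mirrors exactly what __init__
# reads ('Global' plus a per-mode 'dataset'/'loader' block); the data_dir path
# and the empty 'transforms' list are placeholders, and a real setup would
# list the label-encoding operators expected by
# openrec.preprocess.create_operators.
if __name__ == '__main__':
    import logging

    from torch.utils.data import DataLoader

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('text_lmdb_dataset')

    config = {
        'Global': {},
        'Train': {
            'dataset': {
                'data_dir': './train_data/text_lmdb',  # placeholder path
                'transforms': [],  # placeholder: add label-encoding ops here
            },
            'loader': {
                'shuffle': True,
                'batch_size_per_card': 128,
            },
        },
    }

    dataset = TextLMDBDataSet(config, mode='Train', logger=logger)
    print('num samples:', len(dataset))
    print('first sample:', dataset[0])

    # With real transforms configured, the dataset can be fed to a DataLoader.
    loader = DataLoader(dataset, batch_size=128, shuffle=True)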