Spaces:

tonneli
/

EureCA

Sleeping

App Files Files Community

EureCA / dspy /datasets /hotpotqa.py

tonneli

Delete history

f5776d3 almost 2 years ago

raw

history blame contribute delete

3.29 kB

	import random

	from datasets import load_dataset
	from dspy.datasets.dataset import Dataset


	class HotPotQA(Dataset):
	def __init__(self, args, only_hard_examples=True, keep_details='dev_titles', unofficial_dev=True, *kwargs) -> None:
	super().__init__(args, *kwargs)
	assert only_hard_examples, "Care must be taken when adding support for easy examples." \
	"Dev must be all hard to match official dev, but training can be flexible."

	hf_official_train = load_dataset("hotpot_qa", 'fullwiki', split='train')
	hf_official_dev = load_dataset("hotpot_qa", 'fullwiki', split='validation')

	official_train = []
	for raw_example in hf_official_train:
	if raw_example['level'] == 'hard':
	if keep_details is True:
	keys = ['id', 'question', 'answer', 'type', 'supporting_facts']
	elif keep_details == 'dev_titles':
	keys = ['question', 'answer', 'supporting_facts']
	else:
	keys = ['question', 'answer']

	example = {k: raw_example[k] for k in keys}

	if 'supporting_facts' in example:
	example['gold_titles'] = set(example['supporting_facts']['title'])
	del example['supporting_facts']

	official_train.append(example)

	rng = random.Random(0)
	rng.shuffle(official_train)

	self._train = official_train[:len(official_train)*75//100]

	if unofficial_dev:
	self._dev = official_train[len(official_train)*75//100:]
	else:
	self._dev = None

	for example in self._train:
	if keep_details == 'dev_titles':
	del example['gold_titles']

	test = []
	for raw_example in hf_official_dev:
	assert raw_example['level'] == 'hard'
	example = {k: raw_example[k] for k in ['id', 'question', 'answer', 'type', 'supporting_facts']}
	if 'supporting_facts' in example:
	example['gold_titles'] = set(example['supporting_facts']['title'])
	del example['supporting_facts']
	test.append(example)

	self._test = test


	if __name__ == '__main__':
	from dsp.utils import dotdict

	data_args = dotdict(train_seed=1, train_size=16, eval_seed=2023, dev_size=200*5, test_size=0)
	dataset = HotPotQA(**data_args)

	print(dataset)
	print(dataset.train[0].question)
	print(dataset.train[15].question)

	print(len(dataset.train), len(dataset.dev), len(dataset.test))

	print(dataset.dev[0].question)
	print(dataset.dev[340].question)
	print(dataset.dev[937].question)

	"""
	What was the population of the city where Woodward Avenue ends in 2010?
	Where did the star , who is also an executive producer, of the Mick begin her carrer?
	16 1000 0
	Both London and German have seen attacks during war, there was one specific type of attack that Germany called the blitz, what did London call a similar attack?
	Pre-Madonna was a collection of demos by the singer who was a leading presence during the emergence of what network?
	Alan Mills composed the classic folk song that tells the story of what?
	"""