# utils.py
#
# Assumes SYSTEM_PROMPT, CHUNK_PROMPT, REDUCE_PROMPT, and chunk_by_tokens
# are defined earlier in this module.

from typing import List


def generate_summary(
    tokenizer,
    model,
    prompt: str,
    max_new_tokens: int = 192,
    temperature: float = 0.3,
    top_p: float = 0.95,
    repetition_penalty: float = 1.05,
) -> str:
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    # Move inputs to the model's device unconditionally; .to() is a no-op when
    # they already match, and this also covers non-CUDA devices such as MPS.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    gen_ids = model.generate(
        **inputs,
        do_sample=(temperature > 0.0),
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
        eos_token_id=tokenizer.eos_token_id,
        # Many causal LMs have no pad token; fall back to EOS so generate()
        # does not complain about a missing pad_token_id.
        pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens. Slicing by input length is more
    # robust than stripping the prompt string, which can fail when
    # skip_special_tokens changes how the prompt decodes.
    new_ids = gen_ids[0][inputs["input_ids"].shape[-1]:]
    out = tokenizer.decode(new_ids, skip_special_tokens=True)
    return out.strip()


def map_reduce_summarize(
    text: str,
    tokenizer,
    model,
    max_chunk_tokens: int = 900,
    overlap: int = 60,
    chunk_max_new_tokens: int = 128,
    final_max_new_tokens: int = 220,
    temperature: float = 0.2,
    top_p: float = 0.9,
) -> str:
    chunks = chunk_by_tokens(text, tokenizer, max_tokens=max_chunk_tokens, overlap=overlap)

    # Short inputs fit in a single chunk: summarize directly, no reduce step.
    if len(chunks) == 1:
        prompt = f"{SYSTEM_PROMPT} {CHUNK_PROMPT.format(chunk=chunks[0])}"
        return generate_summary(
            tokenizer,
            model,
            prompt,
            max_new_tokens=final_max_new_tokens,
            temperature=temperature,
            top_p=top_p,
        )

    # Map step: summarize each chunk independently.
    partials: List[str] = []
    for ck in chunks:
        p = f"{SYSTEM_PROMPT} {CHUNK_PROMPT.format(chunk=ck)}"
        s = generate_summary(
            tokenizer,
            model,
            p,
            max_new_tokens=chunk_max_new_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        partials.append(s)

    # Reduce step: present the partial summaries as a bullet list and ask the
    # model to condense them. A slightly lower temperature keeps the final
    # pass more deterministic than the map passes.
    merged = "\n- ".join(partials)
    reduce_prompt = f"{SYSTEM_PROMPT} {REDUCE_PROMPT.format(partials='- ' + merged)}"
    final = generate_summary(
        tokenizer,
        model,
        reduce_prompt,
        max_new_tokens=final_max_new_tokens,
        temperature=max(0.1, temperature - 0.1),
        top_p=top_p,
    )
    return final.strip()
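
# A minimal usage sketch, not part of the original module: it assumes a
# Hugging Face causal LM and that the prompt constants above are defined.
# "gpt2" and `long_text` are illustrative placeholders, not the author's
# choices.
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    lm = AutoModelForCausalLM.from_pretrained("gpt2")
    # GPT-2 ships without a pad token; reuse EOS so padding=True in the
    # tokenizer call works.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    long_text = "..."  # placeholder: any document to summarize
    print(map_reduce_summarize(long_text, tok, lm))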