import gradio as gr

# Synonyms for each task category
task_synonyms = {
    "Undergraduate level knowledge": ["undergraduate level knowledge", "MMLU"],
| "Graduate level reasoning": ["graduate level reasoning", "GPOA", "Diamond"], | |
| "Grade school math": ["grade school math", "GSM8K"], | |
| "Math problem-solving": ["math problem-solving", "MATH"], | |
| "Multilingual math": ["multilingual math", "MGSM"], | |
| "Code": ["code", "coding", "programming", "HumanEval"], | |
| "Reasoning over text": ["reasoning over text", "DROP", "F1 score"], | |
| "Mixed evaluations": ["mixed evaluations", "BIG-Bench-Hard"], | |
| "Knowledge Q&A": ["knowledge Q&A", "ARC-Challenge"], | |
| "Common Knowledge": ["common knowledge", "HellaSwag"], | |
| } | |
| # LLM performance data with scores | |
| performance_data = { | |
| "Undergraduate level knowledge": [("Claude 3 Opus", 86.8), ("GPT-4", 86.4), ("Gemini 1.0 Ultra", 83.7)], | |
| "Graduate level reasoning": [("Claude 3 Opus", 50.4), ("Claude 3 Sonnet", 40.4), ("GPT-4", 35.7)], | |
| "Grade school math": [("Claude 3 Opus", 95.0), ("Gemini 1.0 Ultra", 94.4), ("GPT-4", 92.0)], | |
| "Math problem-solving": [("Claude 3 Opus", 60.1), ("Gemini 1.0 Ultra", 53.2), ("GPT-4", 52.9)], | |
| "Multilingual math": [("Claude 3 Opus", 90.7), ("Claude 3 Sonnet", 83.5), ("Gemini 1.0 Ultra", 79.0)], | |
| "Code": [("Claude 3 Opus", 84.9), ("Gemini 1.0 Ultra", 74.4), ("Claude 3 Haiku", 75.9)], | |
| "Reasoning over text": [("Claude 3 Opus", 83.1), ("Gemini 1.0 Ultra", 82.4), ("GPT-4", 80.9)], | |
| "Mixed evaluations": [("Claude 3 Opus", 86.8), ("Gemini 1.0 Ultra", 83.6), ("GPT-4", 83.1)], | |
| "Knowledge Q&A": [("Claude 3 Opus", 96.4), ("GPT-4", 96.3), ("Claude 3 Sonnet", 93.2)], | |
| "Common Knowledge": [("Claude 3 Opus", 95.4), ("GPT-4", 95.3), ("Gemini 1.0 Ultra", 87.8)], | |
| } | |
def recommend_llm(task):
    # Normalize the input task to match against synonyms
    task_lower = task.lower()
    main_category = None
    for key, synonyms in task_synonyms.items():
        if task_lower in map(str.lower, synonyms):
            main_category = key
            break
    if not main_category:
        return "No data available"
    # Look up the models for the matched category and sort them by score, highest first
    recommendations = performance_data.get(main_category, [])
    recommendations_sorted = sorted(recommendations, key=lambda x: x[1], reverse=True)
    # Build a numbered list of recommendations
    result = f"For {task}, the recommended LLMs are:\n"
    for i, (model, score) in enumerate(recommendations_sorted):
        result += f"{i+1}. {model} with a score of {score}%\n"
    return result
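
# For reference, a worked example of the return value (derived from the data above):
#   recommend_llm("coding") matches the "Code" category and returns:
#     For coding, the recommended LLMs are:
#     1. Claude 3 Opus with a score of 84.9%
#     2. Claude 3 Haiku with a score of 75.9%
#     3. Gemini 1.0 Ultra with a score of 74.4%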
# Gradio interface
interface = gr.Interface(
    fn=recommend_llm,
    inputs=gr.Textbox(label="Enter Task"),
    outputs=gr.Textbox(label="LLM Recommendations"),
    title="LLM Recommendation App",
    description="Enter a task to get recommendations for the best LLMs based on performance data. For example, you can enter 'coding', 'undergraduate level knowledge', etc."
)

# Launch the app
interface.launch()
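
To sanity-check the lookup logic without launching the Gradio server, a minimal sketch along these lines calls recommend_llm directly. It assumes the code above is saved as app.py; that filename is an assumption, not something stated by the app itself.

# Hypothetical quick test; assumes the code above lives in app.py
from app import recommend_llm

print(recommend_llm("coding"))        # matches the "Code" category via its synonym list
print(recommend_llm("GSM8K"))         # benchmark names also work (matching is case-insensitive)
print(recommend_llm("unknown task"))  # no match, so it returns "No data available"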