```python
import gradio as gr
from huggingface_hub import InferenceClient
import os

# read the access token and (optionally) a custom endpoint URL from the environment
token = os.getenv("TOKEN")
endpoint = os.getenv(
    "ENDPOINT",
    "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct",
)

# initialize InferenceClient against the TGI endpoint
client = InferenceClient(model=endpoint, token=token)

# query the client in streaming mode, yielding the partial response as it grows
def inference(message, history):
    partial_message = ""
    for new_token in client.text_generation(message, max_new_tokens=100, stream=True):
        partial_message += new_token
        yield partial_message

gr.ChatInterface(
    inference,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Chat with me!", container=False, scale=7),
    title="Gradio 🤝 TGI",
    description="This is a demo of a Gradio UI consuming a TGI endpoint serving Llama 3.1 8B Instruct.",
    theme="abidlabs/Lime",
    examples=["Are tomatoes vegetables?"],
    cache_examples=True,
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
).queue().launch()
```
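As written, `inference` discards the `history` argument, so every turn is answered without conversational context. If the endpoint exposes the OpenAI-compatible chat completion API, the handler can be made history-aware. The sketch below is one way to do that with `InferenceClient.chat_completion`; the `chat_inference` name is ours, and chat-completion support on the endpoint is an assumption, not something the original demo relies on.

```python
# Hedged sketch: stream a history-aware reply through the chat completion API.
# Assumes the endpoint supports chat completion; `history` arrives from
# gr.ChatInterface as a list of (user, assistant) message pairs.
def chat_inference(message, history):
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    partial_message = ""
    for chunk in client.chat_completion(messages, max_tokens=100, stream=True):
        # each streamed chunk carries an incremental delta of the reply
        partial_message += chunk.choices[0].delta.content or ""
        yield partial_message
```

Passing `chat_inference` to `gr.ChatInterface` in place of `inference` leaves the rest of the app unchanged.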