veeps committed · Commit 63bb4b2 · Parent: ab911b6
update

rag.py CHANGED
@@ -7,6 +7,10 @@ from langchain.schema import (
 )
 from datasets import load_dataset
 from pinecone import Pinecone
+from pinecone import ServerlessSpec
+import time
+from langchain_openai import OpenAIEmbeddings
+from tqdm.auto import tqdm
 
 
 dataset = load_dataset(
@@ -35,4 +39,108 @@ res = chat(messages)
 # add latest AI response to messages
 messages.append(res)
 
-# connect to pinecone
+# connect to pinecone
+api_key = os.getenv('PINECONE_API_KEY')
+
+# configure client
+pc = Pinecone(api_key=api_key)
+
+# connect to serverless
+spec = ServerlessSpec(
+    cloud="aws", region="us-east-1"
+)
+
+# initialize index
+index_name = 'llama-2-rag'
+existing_indexes = [
+    index_info["name"] for index_info in pc.list_indexes()
+]
+
+# check if index already exists (it shouldn't if this is first time)
+if index_name not in existing_indexes:
+    # if it does not exist, create index
+    pc.create_index(
+        index_name,
+        dimension=1536,  # dimensionality of ada 002
+        metric='dotproduct',
+        spec=spec
+    )
+    # wait for index to be initialized
+    while not pc.describe_index(index_name).status['ready']:
+        time.sleep(1)
+
+# connect to index
+index = pc.Index(index_name)
+time.sleep(1)
+# view index stats
+index.describe_index_stats()
+
+# initialize the embedding model (ada 002)
+embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")
+
+# iterate over the dataset in batches
+data = dataset.to_pandas()
+batch_size = 100
+
+for i in tqdm(range(0, len(data), batch_size)):
+    i_end = min(len(data), i+batch_size)
+    # get batch of data
+    batch = data.iloc[i:i_end]
+    # generate unique ids for each chunk
+    ids = [f"{x['doi']}-{x['chunk-id']}" for _, x in batch.iterrows()]
+    # get text to embed
+    texts = [x['chunk'] for _, x in batch.iterrows()]
+    # embed text
+    embeds = embed_model.embed_documents(texts)
+    # get metadata to store in Pinecone
+    metadata = [
+        {'text': x['chunk'],
+         'source': x['source'],
+         'title': x['title']} for _, x in batch.iterrows()
+    ]
+    # add to Pinecone
+    index.upsert(vectors=zip(ids, embeds, metadata))
+
+index.describe_index_stats()
+
+#### Retrieval Augmented Generation
+#from langchain_pinecone import PineconeVectorStore
+from langchain.vectorstores import Pinecone as LangChainPinecone  # aliased to avoid shadowing the pinecone client
+
+# the metadata field that contains our text
+text_field = "text"
+
+# initialize the vector store object
+vectorstore = LangChainPinecone(
+    index, embed_model.embed_query, text_field
+)
+
+query = "What is so special about Llama 2?"
+
+vectorstore.similarity_search(query, k=3)
+
+# connect the output from vectorstore to chat
+def augment_prompt(query: str):
+    # get top 3 results from knowledge base
+    results = vectorstore.similarity_search(query, k=3)
+    # get the text from the results
+    source_knowledge = "\n".join([x.page_content for x in results])
+    # feed into an augmented prompt
+    augmented_prompt = f"""Using the contexts below, answer the query.
+
+    Contexts:
+    {source_knowledge}
+
+    Query: {query}"""
+    return augmented_prompt
+
+# create a new user prompt
+prompt = HumanMessage(
+    content=augment_prompt(query)
+)
+# add to messages
+messages.append(prompt)
+
+res = chat(messages)
+
+print(res.content)
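
A quick check worth running before the upsert loop: the embedding model's output length must match the index's dimension=1536, or Pinecone will reject the vectors. A minimal sanity-check sketch, reusing the embed_model defined above (the query string is arbitrary):

# sanity check (sketch): ada 002 vectors must match the index dimension
vec = embed_model.embed_query("dimension check")
assert len(vec) == 1536, f"unexpected embedding size: {len(vec)}"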
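
The commented-out langchain_pinecone import points at the newer package; for reference, a minimal sketch of the equivalent vector store setup with that API. This is an assumption, not what the commit uses, and it needs the separate langchain-pinecone package installed:

# sketch: same vector store via the newer langchain-pinecone package
# (assumes `pip install langchain-pinecone`; reuses `index` and
# `embed_model` from rag.py above)
from langchain_pinecone import PineconeVectorStore

vectorstore = PineconeVectorStore(index=index, embedding=embed_model, text_key="text")
results = vectorstore.similarity_search("What is so special about Llama 2?", k=3)

The main practical difference is that this constructor takes the embeddings object itself rather than the bare embed_query function.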