Update ingest.py
ingest.py CHANGED
@@ -1,92 +1,52 @@
 """Load html from files, clean up, split, ingest into Weaviate."""
 import os
 from pathlib import Path
+from markdown import markdown

-import weaviate
+import pickle
 from bs4 import BeautifulSoup
 from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from InstructorEmbedding import INSTRUCTOR

+print(os.environ["HUGGINFACE_APIKEY"])

 def clean_data(data):
-
-
+    html = markdown(data)
+    soup = BeautifulSoup(html, "html.parser")
+    text = ''.join(soup.findAll(text=True))
     return "\n".join([t for t in text.split("\n") if t])

-
 docs = []
 metadatas = []
-for p in Path("
+for p in Path("docs").rglob("*"):
     if p.is_dir():
         continue
-
-
-
-
+    if str(p).lower().endswith(('.md', '.mdx')):
+        with open(p) as f:
+            print(p)
+            filename = os.path.splitext(p)[0]
+            docs.append(clean_data(f.read()))
+            metadatas.append({"source": filename})

 text_splitter = CharacterTextSplitter(
     separator="\n",
-    chunk_size=
-    chunk_overlap=
+    chunk_size=512,
+    chunk_overlap=64,
     length_function=len,
 )

 documents = text_splitter.create_documents(docs, metadatas=metadatas)

+print("making embedding")
+embedding = HuggingFaceEmbeddings()

-
-
-    url=WEAVIATE_URL,
-    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
-)
-
-client.schema.delete_class("Paragraph")
-client.schema.get()
-schema = {
-    "classes": [
-        {
-            "class": "Paragraph",
-            "description": "A written paragraph",
-            "vectorizer": "text2vec-openai",
-            "moduleConfig": {
-                "text2vec-openai": {
-                    "model": "ada",
-                    "modelVersion": "002",
-                    "type": "text",
-                }
-            },
-            "properties": [
-                {
-                    "dataType": ["text"],
-                    "description": "The content of the paragraph",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": False,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "content",
-                },
-                {
-                    "dataType": ["text"],
-                    "description": "The link",
-                    "moduleConfig": {
-                        "text2vec-openai": {
-                            "skip": True,
-                            "vectorizePropertyName": False,
-                        }
-                    },
-                    "name": "source",
-                },
-            ],
-        },
-    ]
-}
+print("beginning construction of faiss")
+search_index = FAISS.from_documents(documents, embedding)

-
+print("beginning pickle")
+with open("docs.pkl", 'wb') as f:
+    pickle.dump(search_index, f)

-
-for text in documents:
-    batch.add_data_object(
-        {"content": text.page_content, "source": str(text.metadata["source"])},
-        "Paragraph",
-    )
+print("Pickle complete")
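The new ingest.py only builds and pickles the index; the Space's app presumably loads docs.pkl at query time. A minimal sketch of consuming the pickled FAISS store, assuming that setup; the query string and k value are made up for illustration:

import pickle

# Load the FAISS vector store that ingest.py wrote to docs.pkl. The
# HuggingFaceEmbeddings object is serialized with the store, so queries are
# embedded with the same model that built the index.
with open("docs.pkl", "rb") as f:
    search_index = pickle.load(f)

# Fetch the chunks most similar to a question, along with their source files.
results = search_index.similarity_search("How do I split documents into chunks?", k=4)
for doc in results:
    print(doc.metadata["source"], "->", doc.page_content[:100])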