Spaces:
Runtime error
Runtime error
| """Load Markdown docs from files, clean up, split, and ingest into a FAISS index.""" | |
| import os | |
| from pathlib import Path | |
| from markdown import markdown | |
| import pickle | |
| import re | |
| from bs4 import BeautifulSoup | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain.embeddings import HuggingFaceInstructEmbeddings | |
| from langchain.vectorstores import FAISS | |
| from InstructorEmbedding import INSTRUCTOR | |
# Report only whether the API key is configured. Echoing the value itself would
# leak the secret into build/runtime logs, and indexing os.environ directly
# would abort the whole script with KeyError when the variable is unset.
# NOTE(review): the variable name is kept exactly as originally spelled
# ("HUGGINFACE"); presumably it matches the configured secret name -- verify.
if os.environ.get("HUGGINFACE_APIKEY"):
    print("HUGGINFACE_APIKEY is set")
else:
    print("HUGGINFACE_APIKEY is not set")
def clean_data(data):
    """Convert Markdown source into plain text without comments or blank lines.

    The Markdown is rendered to HTML, the HTML is parsed, and the text nodes
    are concatenated. HTML comments are dropped, and empty lines are removed
    from the result.

    Args:
        data: Markdown source as a string.

    Returns:
        The cleaned text, with the remaining non-empty lines joined by "\n".
    """
    # Local import: the file header only imports BeautifulSoup from bs4.
    from bs4 import Comment

    html = markdown(data)
    soup = BeautifulSoup(html, "html.parser")
    # find_all(string=True) also yields Comment nodes, whose string value is
    # the comment body *without* the <!-- --> delimiters. They must be skipped
    # here, because the regex below can no longer recognise them once joined.
    text = "".join(
        node for node in soup.find_all(string=True) if not isinstance(node, Comment)
    )
    # Remove comment markup that survives as literal text (for instance
    # entity-escaped markup that the parser decoded back to "<!-- ... -->").
    cleaned_text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    print(cleaned_text)
    return "\n".join([t for t in cleaned_text.split("\n") if t])
# Collect every Markdown file under docs/ as cleaned text plus a matching
# {"source": ...} metadata entry. Both lists are appended to in the same
# iteration, so docs[i] and metadatas[i] always describe the same file.
docs = []
metadatas = []
docs_root = Path("docs")
for p in docs_root.rglob("*"):
    if p.is_dir():
        continue
    if not str(p).lower().endswith((".md", ".mdx")):
        continue
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which differs between systems.
    docs.append(clean_data(p.read_text(encoding="utf-8")))
    # Source id = path relative to the docs root, extension dropped, always
    # with forward slashes (replaces slicing off a hard-coded 5-char prefix).
    newfile_name = p.relative_to(docs_root).with_suffix("").as_posix()
    print("file:" + newfile_name)
    metadatas.append({"source": newfile_name})
# Splitter settings: break on newlines; chunk_size and chunk_overlap are
# measured with len, i.e. in characters.
splitter_options = {
    "separator": "\n",
    "chunk_size": 768,
    "chunk_overlap": 128,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_options)

# docs and metadatas are index-aligned, so they are handed over together.
documents = text_splitter.create_documents(docs, metadatas=metadatas)
print("making embedding")

# Instructor model plus the instruction strings used for stored passages
# (embed_instruction) and for incoming queries (query_instruction).
model_name = "hkunlp/instructor-large"
embed_instruction = "Represent the text from the Hugging Face code documentation"
query_instruction = "Query the most relevant text from the Hugging Face code documentation"
embedding = HuggingFaceInstructEmbeddings(
    model_name=model_name,
    embed_instruction=embed_instruction,
    query_instruction=query_instruction,
)

print("beginning construction of faiss")
# Build the FAISS vector store from the split documents using the embedding above.
search_index = FAISS.from_documents(documents, embedding)
print("beginning pickle")

# Serialize the whole vector store object to docs.pkl.
# NOTE: unpickling can execute arbitrary code, so docs.pkl should only ever be
# loaded from a trusted source.
with open("docs.pkl", "wb") as pickle_file:
    pickle.dump(search_index, pickle_file)

print("Pickle complete")