import os
import re
import uuid
import logging
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from typing import List

import bs4
import chromadb
import gradio as gr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ollama
from dotenv import load_dotenv
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

from langchain import hub
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader, UnstructuredURLLoader
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_unstructured import UnstructuredLoader

load_dotenv()
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY']    # raises KeyError if the key is missing from .env
os.environ["OPENAI_API_KEY"]       # raises KeyError if the key is missing from .env

# Embedding model used by every ingestion cell below
embeddings_model = HuggingFaceEmbeddings(model_name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5")
def clean_text(text):
    '''Clean the raw output of the web loader.'''
    text = text.replace('\xa0', ' ')
    text = re.sub(r'[\n\r\t]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
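# Quick sanity check of clean_text on a made-up sample of raw loader output
# (the sample string below is only an illustration):
sample = "Accueil\xa0\n\n\tMastères\t:  Procédure   de dépôt\n"
print(clean_text(sample))  # -> "Accueil Mastères : Procédure de dépôt"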
chroma_db_path = "./chroma_db"
chroma_client = chromadb.PersistentClient(path=chroma_db_path)
data = chroma_client.get_or_create_collection(name="my_dataaaa")
file_path = "Charte.pdf"
loader = PyPDFLoader(file_path)
pages = []
for page in loader.lazy_load():
    pages.append(page)
document0 = pages[0].page_content
document0
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits1 = text_splitter.split_text(document0) | |
| splits1 | |
| embeddings1 = embeddings_model.embed_documents( | |
| splits1 | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| ids1 = [str(uuid.uuid4()) for _ in range(len(splits1))] | |
| data.add( | |
| documents=splits1, | |
| embeddings=embeddings1, | |
| ids=ids1 | |
| ) | |
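# Every PDF below repeats the same load -> split -> embed -> add sequence.
# A small helper along these lines (a sketch reusing the embeddings_model and
# the `data` collection defined above; the function name is just a suggestion)
# would keep that pattern in one place:
def ingest_pdf(file_path, chunk_size=700, chunk_overlap=100):
    """Load a PDF, split it into chunks, embed the chunks and store them in `data`."""
    loader = PyPDFLoader(file_path)
    text = "\n".join(page.page_content for page in loader.lazy_load())
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " "],
    )
    splits = splitter.split_text(text)
    embeddings = embeddings_model.embed_documents(splits)
    ids = [str(uuid.uuid4()) for _ in splits]
    data.add(documents=splits, embeddings=embeddings, ids=ids)
    return splits
# e.g. ingest_pdf("circulaire 35-2010.pdf")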
| file_path = "circulaire 35-2010.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| document1=[page.page_content for doc in pages] | |
| document1 | |
| document1 = "\n".join(document1) | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits2 = text_splitter.split_text(document1) | |
| splits2 | |
| embeddings2 = embeddings_model.embed_documents( | |
| splits2, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| ids2 = [str(uuid.uuid4()) for _ in range(len(splits2))] | |
| data.add( | |
| documents=splits2, | |
| embeddings=embeddings2, | |
| ids=ids2 | |
| ) | |
| file_path = "Demande de prolongation de stage MP2 Physique.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| document2 = [page.page_content for doc in pages] | |
| document2 | |
| document2 = "\n".join(document2) | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits3 = text_splitter.split_text(document2) | |
| splits3 | |
| embeddings3 = embeddings_model.embed_documents( | |
| splits3, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| ids3 = [str(uuid.uuid4()) for _ in range(len(splits3))] | |
| data.add( | |
| documents=splits3, | |
| embeddings=embeddings3, | |
| ids=ids3 | |
| ) | |
| file_path = "dérogation pdf.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| document3=[page.page_content for doc in pages] | |
| document3 | |
| document3 = "\n".join(document3) | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits4 = text_splitter.split_text(document3) | |
| splits4 | |
| embeddings4 = embeddings_model.embed_documents( | |
| splits4, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| ids4 = [str(uuid.uuid4()) for _ in range(len(splits4))] | |
| data.add( | |
| documents=splits4, | |
| embeddings=embeddings4, | |
| ids=ids4 | |
| ) | |
| file_path = "Fiche d'évaluation de stage.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| document4=[page.page_content for doc in pages] | |
| document4 | |
| document4 = "\n".join(document4) | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits5 = text_splitter.split_text(document4) | |
| splits5 | |
| embeddings5 = embeddings_model.embed_documents( | |
| splits5, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| ids5 = [str(uuid.uuid4()) for _ in range(len(splits5))] | |
| data.add( | |
| documents=splits5, | |
| embeddings=embeddings5, | |
| ids=ids5 | |
| ) | |
| file_path = "النظام الداخلي لكلية العلوم بالمنستير.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| document5=[page.page_content for doc in pages] | |
| document5 | |
| document5 = "\n".join(document5) | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits6 = text_splitter.split_text(document5) | |
| splits6 | |
| embeddings6 = embeddings_model.embed_documents( | |
| splits6, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| ids6 = [str(uuid.uuid4()) for _ in range(len(splits6))] | |
| data.add( | |
| documents=splits6, | |
| embeddings=embeddings6, | |
| ids=ids6 | |
| ) | |
| file_path = "sante_mentale.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| document6=[page.page_content for doc in pages] | |
| document6 | |
| document6 = "\n".join(document6) | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits7 = text_splitter.split_text(document6) | |
| splits7 | |
| embeddings7 = embeddings_model.embed_documents( | |
| splits7, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| ids7 = [str(uuid.uuid4()) for _ in range(len(splits7))] | |
| data.add( | |
| documents=splits7, | |
| embeddings=embeddings7, | |
| ids=ids7 | |
| ) | |
| file_path = "sante_mentale2.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| document7=[page.page_content for doc in pages] | |
| document7 | |
| document7 = "\n".join(document7) | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits8 = text_splitter.split_text(document7) | |
| splits8 | |
| embeddings8 = embeddings_model.embed_documents( | |
| splits8, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| ids8 = [str(uuid.uuid4()) for _ in range(len(splits8))] | |
| data.add( | |
| documents=splits8, | |
| embeddings=embeddings8, | |
| ids=ids8 | |
| ) | |
| file_path = "score_pour_mastere.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| # In[99]: | |
| document8=[page.page_content for doc in pages] | |
| # In[100]: | |
| document8 | |
| # # splitting DOC8 into chunks | |
| # In[102]: | |
| document8 = "\n".join(document8) | |
| # In[103]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits9 = text_splitter.split_text(document8) | |
| # In[104]: | |
| splits9 | |
| # In[105]: | |
| embeddings9 = embeddings_model.embed_documents( | |
| splits9, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| # In[106]: | |
| ids9 = [str(uuid.uuid4()) for _ in range(len(splits9))] | |
| # In[107]: | |
| data.add( | |
| documents=splits9, | |
| embeddings=embeddings9, | |
| ids=ids9 | |
| ) | |
| # # Master RECHERCHE | |
| # # Document 9 Recherche chimie | |
| # In[110]: | |
| file_path = "recherche_chimie.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| # In[111]: | |
| document9=[page.page_content for doc in pages] | |
| # In[112]: | |
| document9 | |
| # # splitting DOC9 into chunks | |
| # In[114]: | |
| document9= "\n".join(document9) | |
| # In[115]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits10 = text_splitter.split_text(document9) | |
| # In[116]: | |
| splits10 | |
| # In[117]: | |
| embeddings10 = embeddings_model.embed_documents( | |
| splits10, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| # In[118]: | |
| ids10 = [str(uuid.uuid4()) for _ in range(len(splits10))] | |
| # In[119]: | |
| data.add( | |
| documents=splits10, | |
| embeddings=embeddings10, | |
| ids=ids10 | |
| ) | |
| # # Document 10 Recherche info | |
| # In[121]: | |
| file_path = "recherche_info.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| # In[122]: | |
| document10=[page.page_content for doc in pages] | |
| # In[123]: | |
| document10 | |
| # # splitting DOC10 into chunks | |
| # In[125]: | |
| document10= "\n".join(document10) | |
| # In[126]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits11 = text_splitter.split_text(document10) | |
| # In[127]: | |
| splits11 | |
| # In[128]: | |
| embeddings11 = embeddings_model.embed_documents( | |
| splits11, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| # In[129]: | |
| ids11 = [str(uuid.uuid4()) for _ in range(len(splits11))] | |
| # In[130]: | |
| data.add( | |
| documents=splits11, | |
| embeddings=embeddings11, | |
| ids=ids11 | |
| ) | |
| # # Document 11 Recherche physique | |
| # In[132]: | |
| file_path = "recherche_phy.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| # In[133]: | |
| document11=[page.page_content for doc in pages] | |
| # In[134]: | |
| document11 | |
| # # splitting DOC11 into chunks | |
| # In[136]: | |
| document11= "\n".join(document11) | |
| # In[137]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits12 = text_splitter.split_text(document11) | |
| # In[138]: | |
| splits12 | |
| # In[139]: | |
| embeddings12 = embeddings_model.embed_documents( | |
| splits12, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| # In[140]: | |
| ids12 = [str(uuid.uuid4()) for _ in range(len(splits12))] | |
| # In[141]: | |
| data.add( | |
| documents=splits12, | |
| embeddings=embeddings12, | |
| ids=ids12 | |
| ) | |
| # # Mastere Pro | |
| # # Document 12 PRO chimie | |
| # In[144]: | |
| file_path = "pro_chimie.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| # In[145]: | |
| document12=[page.page_content for doc in pages] | |
| # In[146]: | |
| document12 | |
| # # splitting DOC 12 into chunks | |
| # In[148]: | |
| document12= "\n".join(document12) | |
| # In[149]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits13= text_splitter.split_text(document12) | |
| # In[150]: | |
| splits13 | |
| # In[151]: | |
| embeddings13 = embeddings_model.embed_documents( | |
| splits13, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| # In[152]: | |
| ids13 = [str(uuid.uuid4()) for _ in range(len(splits13))] | |
| # In[153]: | |
| data.add( | |
| documents=splits13, | |
| embeddings=embeddings13, | |
| ids=ids13 | |
| ) | |
| # # Document 13 PRO info | |
| # In[155]: | |
| file_path = "pro_info.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| # In[156]: | |
| document13=[page.page_content for doc in pages] | |
| # In[157]: | |
| document13 | |
| # # splitting DOC 13 into chunks | |
| # In[159]: | |
| document13= "\n".join(document13) | |
| # In[160]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits14= text_splitter.split_text(document13) | |
| # In[161]: | |
| splits14 | |
| # In[162]: | |
| embeddings14 = embeddings_model.embed_documents( | |
| splits14, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| # In[163]: | |
| ids14 = [str(uuid.uuid4()) for _ in range(len(splits14))] | |
| # In[164]: | |
| data.add( | |
| documents=splits14, | |
| embeddings=embeddings14, | |
| ids=ids14 | |
| ) | |
# # Document 14 whether two internships can be done at the same time
| # In[166]: | |
| file_path = "deux_stage_.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| # In[167]: | |
| document14=[page.page_content for doc in pages] | |
| # In[168]: | |
| document14 | |
| # # splitting DOC14 INTO chunks | |
| # In[170]: | |
| document14= "\n".join(document14) | |
| # In[171]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits15= text_splitter.split_text(document14) | |
| # In[172]: | |
| splits15 | |
| # In[173]: | |
| embeddings15= embeddings_model.embed_documents( | |
| splits15, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| # In[174]: | |
| ids15 = [str(uuid.uuid4()) for _ in range(len(splits15))] | |
| # In[175]: | |
| data.add( | |
| documents=splits15, | |
| embeddings=embeddings15, | |
| ids=ids15 | |
| ) | |
# # Document 15 questions with answers
| # In[177]: | |
| file_path = "Les avantages de la carte étudiante.pdf" | |
| loader = PyPDFLoader(file_path) | |
| pages = [] | |
| async for page in loader.alazy_load(): | |
| pages.append(page) | |
| # In[178]: | |
| document15=[page.page_content for doc in pages] | |
| # In[179]: | |
| document15 | |
| # # Splitting DOC15 into chunks | |
| # In[181]: | |
| document15= "\n".join(document15) | |
| # In[182]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50, separators=["\n\n", "\n", ".", " ", "\n•"]) | |
| splits16= text_splitter.split_text(document15) | |
| # In[183]: | |
| splits16 | |
| # In[184]: | |
| embeddings16 = embeddings_model.embed_documents( | |
| splits16, | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| # In[185]: | |
| ids16 = [str(uuid.uuid4()) for _ in range(len(splits16))] | |
| # In[186]: | |
| data.add( | |
| documents=splits16, | |
| embeddings=embeddings16, | |
| ids=ids16 | |
| ) | |
# # Checking whether the data was added or not ✅
| # In[188]: | |
stored = data.get(include=['embeddings'])
print(stored)
| # embeddings_model = SentenceTransformer("HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5") | |
| embeddings_model = HuggingFaceEmbeddings(model_name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5") | |
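# With the embedding model defined, a quick way to sanity-check retrieval is to
# embed a question and query the collection directly (a sketch; the question
# below is only an example):
question = "Quelle est la procédure de dépôt des mastères ?"
query_embedding = embeddings_model.embed_query(question)
results = data.query(query_embeddings=[query_embedding], n_results=3)
for doc in results["documents"][0]:
    print(doc[:200])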
| # # Configure `ChromaDB` for our work | |
| # In[29]: | |
| # chroma_client.delete_collection(name="my_dataaaa") # Deletes "my_dataaaa" | |
| # In[30]: | |
| chroma_db_path = "./chroma_db" | |
| chroma_client = chromadb.PersistentClient(path=chroma_db_path) | |
| # In[31]: | |
| data = chroma_client.get_or_create_collection(name="my_dataaaa") | |
| # # <p style="color: orange;">Document 0 Masteres-Procedure-de-Depot</p> | |
| # In[33]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/pages/152/Masteres-Procedure-de-Depot",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| Masteres_Procedure_de_Depot = loader.load() | |
| # In[34]: | |
| Masteres_Procedure_de_Depot = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in Masteres_Procedure_de_Depot] | |
| Masteres_Procedure_de_Depot | |
# ## splitting doc0 into chunks
| # In[36]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100) | |
| splits1 = text_splitter.split_documents( Masteres_Procedure_de_Depot) | |
| # In[37]: | |
| splits1 | |
| # ## Saving to chromadb in data | |
| # In[39]: | |
| contents1 = [doc.page_content for doc in splits1] | |
| metadata1 = [doc.metadata for doc in splits1] | |
| # In[40]: | |
| embeddings1 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits1], | |
| #normalize_embeddings=True, | |
| #batch_size=256, | |
| #show_progress_bar=True | |
| ) | |
| print(embeddings1) | |
| # In[41]: | |
| ids = [str(uuid.uuid4()) for _ in range(len(contents1))] | |
| # In[42]: | |
| data.add( | |
| documents=contents1, | |
| embeddings=embeddings1, | |
| metadatas=metadata1, | |
| ids=ids | |
| ) | |
| # In[43]: | |
# visualizing in a dataframe
| data_dict = { | |
| "ID": ids, | |
| "Document": contents1, | |
| "Metadata": metadata1, | |
| "Embedding Shape": [np.array(embed).shape for embed in embeddings1], | |
| } | |
| df = pd.DataFrame(data_dict) | |
| df.tail() | |
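# PCA and matplotlib are already imported, so a rough 2-D projection of the
# vectors computed above can help eyeball the embedding space (a sketch, not
# part of the ingestion itself):
embedding_matrix = np.array(embeddings1)
pca = PCA(n_components=2)
points = pca.fit_transform(embedding_matrix)
plt.scatter(points[:, 0], points[:, 1])
plt.title("PCA projection of the stored chunk embeddings")
plt.show()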
| # In[44]: | |
def append_data(contents, metadata, embeddings):
    '''Append the documents, metadata and embeddings to data_dict so we can
    visualize what the collection looks like in Chroma.'''
    global df
    new_ids = list(range(len(df) + 1, len(df) + 1 + len(contents)))
    data_dict["ID"].extend(new_ids)
    data_dict["Document"].extend(contents)
    data_dict["Metadata"].extend(metadata)
    data_dict["Embedding Shape"].extend([np.array(embed).shape for embed in embeddings])
    df = pd.DataFrame(data_dict)
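# The web pages below all follow the same load -> clean -> split -> embed ->
# add -> append_data steps; a helper of this shape (a sketch built from the
# objects defined above; the function name is just a suggestion) keeps the
# pattern in one place:
def ingest_web_page(url, css_classes=("content",), chunk_size=700, chunk_overlap=100):
    """Load one page, clean it, split it, embed it and store it in `data`."""
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs=dict(parse_only=bs4.SoupStrainer(class_=css_classes)),
    )
    docs = [
        Document(page_content=clean_text(doc.page_content), metadata=doc.metadata)
        for doc in loader.load()
    ]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " "],
    )
    splits = splitter.split_documents(docs)
    contents = [doc.page_content for doc in splits]
    metadatas = [doc.metadata for doc in splits]
    embeddings = embeddings_model.embed_documents(contents)
    ids = [str(uuid.uuid4()) for _ in contents]
    data.add(documents=contents, embeddings=embeddings, metadatas=metadatas, ids=ids)
    append_data(contents, metadatas, embeddings)
    return splits
# e.g. ingest_web_page("https://fsm.rnu.tn/fra/pages/152/Masteres-Procedure-de-Depot")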
| # # <p style="color: orange;">Document 1 Theses-Inscriptions-etProcedure-de-Depot</p> | |
| # In[46]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/pages/147/Theses-Inscriptions-etProcedure-de-Depot",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| Theses_Inscriptions_etProcedure_de_Depot = loader.load() | |
| # In[47]: | |
| Theses_Inscriptions_etProcedure_de_Depot = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in Theses_Inscriptions_etProcedure_de_Depot] | |
| Theses_Inscriptions_etProcedure_de_Depot | |
# ## splitting doc1 into chunks
| # In[49]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits2 = text_splitter.split_documents( Theses_Inscriptions_etProcedure_de_Depot) | |
| # In[50]: | |
| splits2 | |
| # In[51]: | |
| contents2= [doc.page_content for doc in splits2] | |
| metadata2 = [doc.metadata for doc in splits2] | |
| # In[52]: | |
| embeddings2 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits2], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings2) | |
| # In[53]: | |
| ids2= [str(uuid.uuid4()) for _ in range(len(contents2))] | |
| # In[54]: | |
| data.add( | |
| documents=contents2, | |
| embeddings=embeddings2, | |
| metadatas=metadata2, | |
| ids=ids2 | |
| ) | |
| # In[55]: | |
| append_data(contents2, metadata2, embeddings2) | |
| # In[56]: | |
| df | |
| # # <p style="color: orange;"> Document 2 رشة_بعنوان_أهمية_الصحة_النفسية</p> | |
| # In[58]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/articles/4798/%D9%88%D8%B1%D8%B4%D8%A9-%D8%A8%D8%B9%D9%86%D9%88%D8%A7%D9%86-%D8%A3%D9%87%D9%85%D9%8A%D8%A9-%D8%A7%D9%84%D8%B5%D8%AD%D8%A9-%D8%A7%D9%84%D9%86%D9%81%D8%B3%D9%8A%D8%A9",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| warcha_mental_health = loader.load() | |
| # In[59]: | |
| warcha_mental_health = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in warcha_mental_health] | |
| warcha_mental_health | |
# ## splitting doc 2 into chunks
| # In[61]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits3 = text_splitter.split_documents( warcha_mental_health) | |
| # In[62]: | |
| splits3 | |
| # In[63]: | |
| contents3= [doc.page_content for doc in splits3] | |
| metadata3 = [doc.metadata for doc in splits3] | |
| # In[64]: | |
| embeddings3 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits3], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings3) | |
| # In[65]: | |
| ids3 = [str(uuid.uuid4()) for _ in range(len(contents3))] | |
| # In[66]: | |
| data.add( | |
| documents=contents3, | |
| embeddings=embeddings3, | |
| metadatas=metadata3, | |
| ids=ids3 | |
| ) | |
| # In[67]: | |
| append_data(contents3, metadata3, embeddings3) | |
| # In[68]: | |
| df.tail() | |
| # # <p style="color: orange;"> Document 3 festival-de-la-creativite-estudiantine</p> | |
| # In[70]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/articles/4795/festival-de-la-creativite-estudiantine",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| festival_de_la_creativite_estudiantinet = loader.load() | |
| # In[71]: | |
| festival_de_la_creativite_estudiantinet = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in festival_de_la_creativite_estudiantinet] | |
| festival_de_la_creativite_estudiantinet | |
| # ## splitting the Doc3 into chunks | |
| # In[73]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits4 = text_splitter.split_documents( festival_de_la_creativite_estudiantinet) | |
| # In[74]: | |
| print(splits4[0].page_content) # First chunk's content | |
| print(splits4[0].metadata) | |
| # In[75]: | |
| contents4= [doc.page_content for doc in splits4] | |
| metadata4 = [doc.metadata for doc in splits4] | |
| # In[76]: | |
| embeddings4 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits4], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings4) | |
| # In[77]: | |
| ids4 = [str(uuid.uuid4()) for _ in range(len(contents4))] | |
| # In[78]: | |
| data.add( | |
| documents=contents4, | |
| embeddings=embeddings4, | |
| metadatas=metadata4, | |
| ids=ids4 | |
| ) | |
| # In[79]: | |
| append_data(contents4, metadata4, embeddings4) | |
| # In[80]: | |
| df | |
| # # <p style="color: orange;"> Document 4 bourses-d-alternance-2025</p> | |
| # In[82]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/articles/4813/bourses-d-alternance-2025",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| Bourse_alternance = loader.load() | |
| # In[83]: | |
| Bourse_alternance = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in Bourse_alternance] | |
| Bourse_alternance | |
| # ## splitting doc 4 into chunks | |
| # In[85]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits5 = text_splitter.split_documents( Bourse_alternance) | |
| # In[86]: | |
| print(splits5[2].page_content) | |
| print(splits5[2].metadata) | |
| # In[87]: | |
| contents5= [doc.page_content for doc in splits5] | |
| metadata5 = [doc.metadata for doc in splits5] | |
| # In[88]: | |
| embeddings5 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits5], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings5) | |
| # In[89]: | |
| ids5 = [str(uuid.uuid4()) for _ in range(len(contents5))] | |
| # In[90]: | |
| data.add( | |
| documents=contents5, | |
| embeddings=embeddings5, | |
| metadatas=metadata5, | |
| ids=ids5 | |
| ) | |
| # In[91]: | |
| append_data(contents5, metadata5, embeddings5) | |
| # In[92]: | |
| df | |
| # # <p style="color: orange;"> Document 5 the-indian-council-for-cultural-relations--iccr</p> | |
| # In[94]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/articles/4807/the-indian-council-for-cultural-relations--iccr-",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| the_indian_council_for_cultural_relations = loader.load() | |
| # In[95]: | |
| the_indian_council_for_cultural_relations = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in the_indian_council_for_cultural_relations] | |
| the_indian_council_for_cultural_relations | |
| # ## splitting doc 5 into chunks | |
| # In[97]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits6 = text_splitter.split_documents( the_indian_council_for_cultural_relations) | |
| # In[98]: | |
| splits6 | |
| # In[99]: | |
| contents6= [doc.page_content for doc in splits6] | |
| metadata6 = [doc.metadata for doc in splits6] | |
| # In[100]: | |
| embeddings6 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits6], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings6) | |
| # In[101]: | |
| ids6 = [str(uuid.uuid4()) for _ in range(len(contents6))] | |
| # In[102]: | |
| data.add( | |
| documents=contents6, | |
| embeddings=embeddings6, | |
| metadatas=metadata6, | |
| ids=ids6 | |
| ) | |
| # In[103]: | |
| append_data(contents6, metadata6, embeddings6) | |
| # In[104]: | |
| df | |
| # In[105]: | |
| # page_url = "https://fsm.rnu.tn/useruploads/files/au2425/NV%20ICCR.pdf" | |
| # loader = PyPDFLoader(page_url) | |
| # applications_guidelines_indian = [] | |
| # async for doc in loader.alazy_load(): | |
| # applications_guidelines_indian.append(doc) | |
| # In[106]: | |
| # applications_guidelines_indian | |
| # In[107]: | |
| # documents6 | |
| # In[108]: | |
| # pip install "unstructured[pdf]" | |
| # # <p style="color: orange;"> Document 6 Règlement intérieur des examens</p> | |
| # In[110]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/pages/346/R%C3%A8glement-int%C3%A9rieur-des-examens",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| Règlement_intérieur_des_examens = loader.load() | |
| # In[111]: | |
| Règlement_intérieur_des_examens = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in Règlement_intérieur_des_examens] | |
| Règlement_intérieur_des_examens | |
| # ## splitting doc 6 into chunks | |
| # In[113]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits7 = text_splitter.split_documents( Règlement_intérieur_des_examens) | |
| # In[114]: | |
| splits7 | |
| # In[115]: | |
| contents7= [doc.page_content for doc in splits7] | |
| metadata7 = [doc.metadata for doc in splits7] | |
| # In[116]: | |
| embeddings7 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits7], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings7) | |
| # In[117]: | |
| ids7 = [str(uuid.uuid4()) for _ in range(len(contents7))] | |
| # In[118]: | |
| data.add( | |
| documents=contents7, | |
| embeddings=embeddings7, | |
| metadatas=metadata7, | |
| ids=ids7 | |
| ) | |
| # In[119]: | |
| append_data(contents7, metadata7, embeddings7) | |
| # In[120]: | |
| df | |
| # # <p style="color: orange;">Document 7 Gestion des Stages & PFE (CPE-BR-01-00)</p> | |
| # In[122]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/pages/73/Stages-&-PFE",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| Stages_PFE = loader.load() | |
| # In[123]: | |
| Stages_PFE = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in Stages_PFE] | |
| Stages_PFE | |
| # ## splitting doc 7 into chunks | |
| # In[125]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits8 = text_splitter.split_documents( Stages_PFE) | |
| # In[126]: | |
| splits8 | |
| # In[127]: | |
| contents8= [doc.page_content for doc in splits8] | |
| metadata8 = [doc.metadata for doc in splits8] | |
| # In[128]: | |
| embeddings8= embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits8], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings8) | |
| # In[129]: | |
| ids8 = [str(uuid.uuid4()) for _ in range(len(contents8))] | |
| # In[130]: | |
| data.add( | |
| documents=contents8, | |
| embeddings=embeddings8, | |
| metadatas=metadata8, | |
| ids=ids8 | |
| ) | |
| # In[131]: | |
| append_data(contents8, metadata8, embeddings8) | |
| # In[132]: | |
| df | |
| # # <p style="color: orange;">Document 8 Procédure de déroulement des stages facultatifs (CPE-IN-01-00)</p> | |
| # In[134]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/pages/437/Proc%C3%A9dure-de-d%C3%A9roulement-des-stages-facultatif",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| Procédure_de_déroulement_des_stages_facultatifs = loader.load() | |
| # In[135]: | |
| Procédure_de_déroulement_des_stages_facultatifs = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in Procédure_de_déroulement_des_stages_facultatifs] | |
| Procédure_de_déroulement_des_stages_facultatifs | |
| # ## splitting doc 8 into chunks | |
| # In[137]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits9 = text_splitter.split_documents( Procédure_de_déroulement_des_stages_facultatifs) | |
| # In[138]: | |
| splits9 | |
| # In[139]: | |
| contents9= [doc.page_content for doc in splits9] | |
| metadata9 = [doc.metadata for doc in splits9] | |
| # In[140]: | |
| embeddings9 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits9], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings9) | |
| # In[141]: | |
| ids9 = [str(uuid.uuid4()) for _ in range(len(contents9))] | |
| # In[142]: | |
| data.add( | |
| documents=contents9, | |
| embeddings=embeddings9, | |
| metadatas=metadata9, | |
| ids=ids9 | |
| ) | |
| # In[143]: | |
| append_data(contents9, metadata9, embeddings9) | |
| # In[144]: | |
| df | |
| # # <p style="color: orange;"> Document 9 Procédure de déroulement des stages obligatoires (CPE-IN-02-00)</p> | |
| # In[146]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/pages/75/Proc%C3%A9dure-de-d%C3%A9roulement-des-stages",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| Procédure_de_déroulement_des_stages_obligatoires = loader.load() | |
| # In[147]: | |
| Procédure_de_déroulement_des_stages_obligatoires = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in Procédure_de_déroulement_des_stages_obligatoires] | |
| Procédure_de_déroulement_des_stages_obligatoires | |
| # ## splitting doc 9 into chunks | |
| # In[149]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits10= text_splitter.split_documents(Procédure_de_déroulement_des_stages_obligatoires) | |
| # In[150]: | |
| splits10 | |
| # In[151]: | |
| contents10= [doc.page_content for doc in splits10] | |
| metadata10 = [doc.metadata for doc in splits10] | |
| # In[152]: | |
| embeddings10 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits10], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings10) | |
| # In[153]: | |
| ids10 = [str(uuid.uuid4()) for _ in range(len(contents10))] | |
| # In[154]: | |
| data.add( | |
| documents=contents10, | |
| embeddings=embeddings10, | |
| metadatas=metadata10, | |
| ids=ids10 | |
| ) | |
| # In[155]: | |
| append_data(contents10, metadata10, embeddings10) | |
| # In[156]: | |
| df | |
| # # <p style="color: orange;"> Document 10 Partenariat international</p> | |
| # In[158]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/pages/9/Partenariat-international",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| Partenariat_international = loader.load() | |
| # In[159]: | |
| Partenariat_international = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in Partenariat_international] | |
| Partenariat_international | |
| # ## splitting doc 10 into chunks | |
| # In[161]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits11 = text_splitter.split_documents(Partenariat_international) | |
| # In[162]: | |
| splits11 | |
| # In[163]: | |
| contents11= [doc.page_content for doc in splits11] | |
| metadata11 = [doc.metadata for doc in splits11] | |
| # In[164]: | |
| embeddings11 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits11], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings11) | |
| # In[165]: | |
| ids11 = [str(uuid.uuid4()) for _ in range(len(contents11))] | |
| # In[166]: | |
| data.add( | |
| documents=contents11, | |
| embeddings=embeddings11, | |
| metadatas=metadata11, | |
| ids=ids11 | |
| ) | |
| # In[167]: | |
| append_data(contents11, metadata11, embeddings11) | |
| # In[168]: | |
| df | |
| # # <p style="color: orange;"> Document 11 Communication</p> | |
| # In[170]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/pages/140/Communication",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| Communication = loader.load() | |
| # In[171]: | |
| Communication = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in Communication] | |
| Communication | |
| # ## splitting doc 11 into chunks | |
| # In[173]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits12 = text_splitter.split_documents(Communication) | |
| # In[174]: | |
| splits12 | |
| # In[175]: | |
| contents12= [doc.page_content for doc in splits12] | |
| metadata12 = [doc.metadata for doc in splits12] | |
| # In[176]: | |
| embeddings12 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits12], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings12) | |
| # In[177]: | |
| ids12 = [str(uuid.uuid4()) for _ in range(len(contents12))] | |
| # In[178]: | |
| data.add( | |
| documents=contents12, | |
| embeddings=embeddings12, | |
| metadatas=metadata12, | |
| ids=ids12 | |
| ) | |
| # In[179]: | |
| append_data(contents12, metadata12, embeddings12) | |
| # In[180]: | |
| df | |
| # # <p style="color: orange;"> Document 12 Liens utiles</p> | |
| # In[182]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/links",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("links_container","link_item","link_tags") | |
| ) | |
| ), | |
| ) | |
| Liens_utiles = loader.load() | |
| # In[183]: | |
| Liens_utiles = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in Liens_utiles] | |
| Liens_utiles | |
| # ## splitting doc 12 into chunks | |
| # In[185]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits13 = text_splitter.split_documents(Liens_utiles) | |
| # In[186]: | |
| splits13 | |
| # In[187]: | |
| contents13= [doc.page_content for doc in splits13] | |
| metadata13 = [doc.metadata for doc in splits13] | |
| # In[188]: | |
| embeddings13 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits13], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings13) | |
| # In[189]: | |
| ids13 = [str(uuid.uuid4()) for _ in range(len(contents13))] | |
| # In[190]: | |
| data.add( | |
| documents=contents13, | |
| embeddings=embeddings13, | |
| metadatas=metadata13, | |
| ids=ids13 | |
| ) | |
| # In[191]: | |
| append_data(contents13, metadata13, embeddings13) | |
| # In[192]: | |
| df | |
| # # <p style="color: orange;"> Document 13 Departement Chimie </p> | |
| # In[194]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/departements/CH/4/chimie",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| Chimie = loader.load() | |
| # In[195]: | |
| Chimie = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in Chimie] | |
| Chimie | |
| # ## splitting doc 13 into chunks | |
| # In[197]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits14 = text_splitter.split_documents(Chimie) | |
| # In[198]: | |
| splits14 | |
| # In[199]: | |
| contents14= [doc.page_content for doc in splits14] | |
| metadata14 = [doc.metadata for doc in splits14] | |
| # In[200]: | |
| embeddings14 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits14], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings14) | |
| # In[201]: | |
| ids14 = [str(uuid.uuid4()) for _ in range(len(contents14))] | |
| # In[202]: | |
| data.add( | |
| documents=contents14, | |
| embeddings=embeddings14, | |
| metadatas=metadata14, | |
| ids=ids14 | |
| ) | |
| # In[203]: | |
| append_data(contents14, metadata14, embeddings14) | |
| # In[204]: | |
| df | |
| # # <p style="color: orange;"> Document 14 Departement Mathematique </p> | |
| # In[206]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/departements/M/1/mathematiques",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("selectEnsFilter") | |
| ) | |
| ), | |
| ) | |
| math = loader.load() | |
| # In[207]: | |
| math = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in math] | |
| math | |
| # ## splitting doc 14 into chunks | |
| # In[209]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits15 = text_splitter.split_documents(math) | |
| # In[210]: | |
| splits15 | |
| # In[211]: | |
| contents15= [doc.page_content for doc in splits15] | |
| metadata15 = [doc.metadata for doc in splits15] | |
| # In[212]: | |
| embeddings15 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits15], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings15) | |
| # In[213]: | |
| ids15 = [str(uuid.uuid4()) for _ in range(len(contents15))] | |
| # In[214]: | |
| data.add( | |
| documents=contents15, | |
| embeddings=embeddings15, | |
| metadatas=metadata15, | |
| ids=ids15 | |
| ) | |
| # In[215]: | |
| append_data(contents15, metadata15, embeddings15) | |
| # In[216]: | |
| df | |
| # # <p style="color: orange;"> Document 15 Departement informatique </p> | |
| # In[218]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/departements/Info/2/informatique",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("selectEnsFilter") | |
| ) | |
| ), | |
| ) | |
| info = loader.load() | |
| # In[219]: | |
| info = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in info] | |
| info | |
| # ## splitting doc 15 into chunks | |
| # In[221]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits16=text_splitter.split_documents(info) | |
| # In[222]: | |
| splits16 | |
| # In[223]: | |
| contents16= [doc.page_content for doc in splits16] | |
| metadata16 = [doc.metadata for doc in splits16] | |
| # In[224]: | |
| embeddings16 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits16], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings16) | |
| # In[225]: | |
| ids16 = [str(uuid.uuid4()) for _ in range(len(contents16))] | |
| # In[226]: | |
| data.add( | |
| documents=contents16, | |
| embeddings=embeddings16, | |
| metadatas=metadata16, | |
| ids=ids16 | |
| ) | |
| # In[227]: | |
| append_data(contents16, metadata16, embeddings16) | |
| # In[228]: | |
| df | |
| # # <p style="color: orange;">Document 16 departement Physique </p> | |
| # In[231]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/departements/PH/3/physique",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("selectEnsFilter") | |
| ) | |
| ), | |
| ) | |
| physique = loader.load() | |
| # In[232]: | |
| physique = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in physique] | |
| physique | |
| # ## splitting doc 16 into chunks | |
| # In[234]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits17 = text_splitter.split_documents(physique) | |
| # In[235]: | |
| splits17 | |
| # In[236]: | |
| contents17= [doc.page_content for doc in splits17] | |
| metadata17 = [doc.metadata for doc in splits17] | |
| # In[237]: | |
| embeddings17 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits17], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings17) | |
| # In[238]: | |
| ids17 = [str(uuid.uuid4()) for _ in range(len(contents17))] | |
| # In[239]: | |
| data.add( | |
| documents=contents17, | |
| embeddings=embeddings17, | |
| metadatas=metadata17, | |
| ids=ids17 | |
| ) | |
| # In[240]: | |
| append_data(contents17, metadata17, embeddings17) | |
| # In[241]: | |
| df | |
| # # <p style="color: orange;">Document 17 Enseignement Tronc Commun </p> | |
| # In[243]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/departements/ET/5/enseignement-tronc-commun",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| Enseignement_Tronc_Commun = loader.load() | |
| # In[244]: | |
| Enseignement_Tronc_Commun = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in Enseignement_Tronc_Commun] | |
| Enseignement_Tronc_Commun | |
| # ## splitting doc 17 into chunks | |
| # In[246]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits18 = text_splitter.split_documents(Enseignement_Tronc_Commun) | |
| # In[247]: | |
| splits18 | |
| # In[248]: | |
| contents18= [doc.page_content for doc in splits18] | |
| metadata18 = [doc.metadata for doc in splits18] | |
| # In[249]: | |
| embeddings18 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits18], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings18) | |
| # In[250]: | |
| ids18 = [str(uuid.uuid4()) for _ in range(len(contents18))] | |
| # In[251]: | |
| data.add( | |
| documents=contents18, | |
| embeddings=embeddings18, | |
| metadatas=metadata18, | |
| ids=ids18 | |
| ) | |
| # In[252]: | |
| append_data(contents18, metadata18, embeddings18) | |
| # In[253]: | |
| df | |
| # # <p style="color: orange;">Document 18 اخر بلاغ للتسجيل بالنسبة للسنة الجامعية </p> | |
| # | |
| # In[255]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/articles/4712/%D8%A7%D8%AE%D8%B1-%D8%A8%D9%84%D8%A7%D8%BA-%D9%84%D9%84%D8%AA%D8%B3%D8%AC%D9%8A%D9%84-%D8%A8%D8%A7%D9%84%D9%86%D8%B3%D8%A8%D8%A9-%D9%84%D9%84%D8%B3%D9%86%D8%A9-%D8%A7%D9%84%D8%AC%D8%A7%D9%85%D8%B9%D9%8A%D8%A9-2024-2025",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| ekher_balegh = loader.load() | |
| # In[256]: | |
| ekher_balegh = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in ekher_balegh] | |
| ekher_balegh | |
| # ## splitting doc 18 into chunks | |
| # In[258]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits19 = text_splitter.split_documents(ekher_balegh) | |
| # In[259]: | |
| splits19 | |
| # In[260]: | |
| contents19= [doc.page_content for doc in splits19] | |
| metadata19 = [doc.metadata for doc in splits19] | |
| # In[261]: | |
| embeddings19 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits19], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings19) | |
| # In[262]: | |
| ids19 = [str(uuid.uuid4()) for _ in range(len(contents19))] | |
| # In[263]: | |
| data.add( | |
| documents=contents19, | |
| embeddings=embeddings19, | |
| metadatas=metadata19, | |
| ids=ids19 | |
| ) | |
| # In[264]: | |
| append_data(contents19, metadata19, embeddings19) | |
| # In[265]: | |
| df | |
| # # <p style="color: orange;">Documents 19 Comptes extranet des étudiants 2024-2025 </p> | |
| # | |
| # In[267]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/articles/4673/comptes-extranet-des-etudiants-2024-2025",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| comptes_extranet_des_etudiants = loader.load() | |
| # In[268]: | |
| comptes_extranet_des_etudiants = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in comptes_extranet_des_etudiants] | |
| comptes_extranet_des_etudiants | |
| # ## splitting doc 19 into chunks | |
| # In[270]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits20 = text_splitter.split_documents(comptes_extranet_des_etudiants) | |
| # In[271]: | |
| splits20 | |
| # In[272]: | |
| contents20= [doc.page_content for doc in splits20] | |
| metadata20 = [doc.metadata for doc in splits20] | |
| # In[273]: | |
| embeddings20 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits20], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings20) | |
| # In[274]: | |
| ids20 = [str(uuid.uuid4()) for _ in range(len(contents20))] | |
| # In[275]: | |
| data.add( | |
| documents=contents20, | |
| embeddings=embeddings20, | |
| metadatas=metadata20, | |
| ids=ids20 | |
| ) | |
| # In[276]: | |
| append_data(contents20, metadata20, embeddings20) | |
| # In[277]: | |
| df | |
| # # <p style="color: orange;"> Document 20 بلاغ الترسيم للسنة الجامعية </p> | |
| # | |
| # In[279]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/articles/4395/%D8%A8%D9%84%D8%A7%D8%BA-%D8%A7%D9%84%D8%AA%D8%B1%D8%B3%D9%8A%D9%85-%D9%84%D9%84%D8%B3%D9%86%D8%A9-%D8%A7%D9%84%D8%AC%D8%A7%D9%85%D8%B9%D9%8A%D8%A9-2024-2025",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| balegh_tarsim = loader.load() | |
| # In[280]: | |
balegh_tarsim = [
    Document(page_content=clean_text(doc.page_content), metadata=doc.metadata)
    for doc in balegh_tarsim]
balegh_tarsim
| # ## splitting doc 20 into chunks | |
| # In[282]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits21 = text_splitter.split_documents(balegh_tarsim) | |
| # In[283]: | |
| splits21 | |
| # In[284]: | |
| contents21= [doc.page_content for doc in splits21] | |
| metadata21= [doc.metadata for doc in splits21] | |
| # In[285]: | |
| embeddings21= embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits21], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings21) | |
| # In[286]: | |
| ids21 = [str(uuid.uuid4()) for _ in range(len(contents21))] | |
| # In[287]: | |
| data.add( | |
| documents=contents21, | |
| embeddings=embeddings21, | |
| metadatas=metadata21, | |
| ids=ids21 | |
| ) | |
| # In[288]: | |
| append_data(contents21, metadata21, embeddings21) | |
| # In[289]: | |
| df | |
| # # <p style="color: orange;">Document 21 Fiche de renseignements des diplômés </p> | |
| # | |
| # In[291]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/pages/138/Fiche-de-renseignements-des-dipl%C3%B4m%C3%A9s",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| Fiche_de_renseignements_des_diplome = loader.load() | |
| # In[292]: | |
| Fiche_de_renseignements_des_diplome = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in Fiche_de_renseignements_des_diplome] | |
| Fiche_de_renseignements_des_diplome | |
| # ## splitting doc 21 into chunks | |
| # In[294]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits22 = text_splitter.split_documents(Fiche_de_renseignements_des_diplome) | |
| # In[295]: | |
| splits22 | |
| # In[296]: | |
| contents22= [doc.page_content for doc in splits22] | |
| metadata22 = [doc.metadata for doc in splits22] | |
| # In[297]: | |
| embeddings22 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits22], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings22) | |
| # In[298]: | |
| ids22 = [str(uuid.uuid4()) for _ in range(len(contents22))] | |
| # In[299]: | |
| data.add( | |
| documents=contents22, | |
| embeddings=embeddings22, | |
| metadatas=metadata22, | |
| ids=ids22 | |
| ) | |
| # In[300]: | |
| append_data(contents22, metadata22, embeddings22) | |
| # In[301]: | |
| df | |
| # # <p style="color: orange;">Document 22 Loi de creation FSM </p> | |
| # | |
| # In[303]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/pages/1/Loi-de-cr%C3%A9ation",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| loi_de_creation = loader.load() | |
| # In[304]: | |
| loi_de_creation = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in loi_de_creation] | |
| loi_de_creation | |
| # ## splitting doc 22 into chunks | |
| # In[306]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits23 = text_splitter.split_documents(loi_de_creation) | |
| # In[307]: | |
| splits23 | |
| # In[308]: | |
| contents23= [doc.page_content for doc in splits23] | |
| metadata23 = [doc.metadata for doc in splits23] | |
| # In[309]: | |
| embeddings23 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits23], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings23) | |
| # In[310]: | |
| ids23 = [str(uuid.uuid4()) for _ in range(len(contents23))] | |
| # In[311]: | |
| data.add( | |
| documents=contents23, | |
| embeddings=embeddings23, | |
| metadatas=metadata23, | |
| ids=ids23 | |
| ) | |
| # In[312]: | |
| append_data(contents23, metadata23, embeddings23) | |
| # In[313]: | |
| df | |
| # # <p style="color: orange;">Document 23 loi en chiffre </p> | |
| # | |
| # In[315]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/pages/3/En-chiffres",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| loi_en_chiffre = loader.load() | |
| # In[316]: | |
| loi_en_chiffre = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in loi_en_chiffre] | |
| loi_en_chiffre | |
| # ## splitting doc 23 into chunks | |
| # In[318]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits24 = text_splitter.split_documents(loi_en_chiffre) | |
| # In[319]: | |
| splits24 | |
| # In[320]: | |
| contents24= [doc.page_content for doc in splits24] | |
| metadata24 = [doc.metadata for doc in splits24] | |
| # In[321]: | |
| embeddings24 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits24], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings24) | |
| # In[322]: | |
| ids24 = [str(uuid.uuid4()) for _ in range(len(contents24))] | |
| # In[323]: | |
| data.add( | |
| documents=contents24, | |
| embeddings=embeddings24, | |
| metadatas=metadata24, | |
| ids=ids24 | |
| ) | |
| # In[324]: | |
| append_data(contents24, metadata24, embeddings24) | |
| # In[325]: | |
| df | |
| # # LICENCE | |
| # # <p style="color: orange;">Document 24 PARCOURS LMD Mathématiques Appliquées</p> | |
| # | |
| # In[328]: | |
| loader = WebBaseLoader( | |
| web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=ABhRHFxzAmNUZVIoBj4ENQYgX2sBPA==&etab=VjJQYQk7",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("center") | |
| ) | |
| ), | |
| ) | |
| parcours_math_appli = loader.load() | |
| # In[329]: | |
| parcours_math_appli = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in parcours_math_appli] | |
| parcours_math_appli | |
| # ## splitting doc 24 into chunks | |
| # In[331]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits25 = text_splitter.split_documents(parcours_math_appli) | |
| # In[332]: | |
| splits25 | |
| # In[333]: | |
| contents25= [doc.page_content for doc in splits25] | |
| metadata25 = [doc.metadata for doc in splits25] | |
| # In[334]: | |
| embeddings25 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits25], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings25) | |
| # In[335]: | |
| ids25 = [str(uuid.uuid4()) for _ in range(len(contents25))] | |
| # In[336]: | |
| data.add( | |
| documents=contents25, | |
| embeddings=embeddings25, | |
| metadatas=metadata25, | |
| ids=ids25 | |
| ) | |
| # In[337]: | |
| append_data(contents25, metadata25, embeddings25) | |
| # In[338]: | |
| df | |
| # # <p style="color: orange;"> Document 25 parcours lmd Computer Science</p> | |
| # | |
| # In[340]: | |
| loader = WebBaseLoader( | |
| web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=UkpTHlxzUzJXZlctDjJTYFZwDDI=&etab=VjJZaAg6",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("center") | |
| ) | |
| ), | |
| ) | |
| parcours_computer_science = loader.load() | |
| # In[341]: | |
| parcours_computer_science = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in parcours_computer_science] | |
| parcours_computer_science | |
| # ## splitting doc 25 into chunks | |
| # In[343]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits26 = text_splitter.split_documents(parcours_computer_science) | |
| # In[344]: | |
| splits26 | |
| # In[345]: | |
| contents26= [doc.page_content for doc in splits26] | |
| metadata26= [doc.metadata for doc in splits26] | |
| # In[346]: | |
| embeddings26 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits26], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings26) | |
| # In[347]: | |
| ids26 = [str(uuid.uuid4()) for _ in range(len(contents26))] | |
| # In[348]: | |
| data.add( | |
| documents=contents26, | |
| embeddings=embeddings26, | |
| metadatas=metadata26, | |
| ids=ids26 | |
| ) | |
| # In[349]: | |
| append_data(contents26, metadata26, embeddings26) | |
| # In[350]: | |
| df | |
| # # <p style="color: orange;"> Document 26 Parcours LMD Mesures et Instrumentation</p> | |
| # | |
| # In[352]: | |
| loader = WebBaseLoader( | |
| web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=W0NXGlp1UjNWZwN5BzkHMVN1DzsBPA==&etab=BGBYaQw+",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("center") | |
| ) | |
| ), | |
| ) | |
| parcours_Mesures = loader.load() | |
| # In[353]: | |
| parcours_Mesures = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in parcours_Mesures] | |
| parcours_Mesures | |
| # ## splitting doc 26 into chunks | |
| # In[355]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits27 = text_splitter.split_documents(parcours_Mesures) | |
| # In[356]: | |
| splits27 | |
| # In[357]: | |
| contents27= [doc.page_content for doc in splits27] | |
| metadata27= [doc.metadata for doc in splits27] | |
| # In[358]: | |
| embeddings27 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits27], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings27) | |
| # In[359]: | |
| ids27 = [str(uuid.uuid4()) for _ in range(len(contents27))] | |
| # In[360]: | |
| data.add( | |
| documents=contents27, | |
| embeddings=embeddings27, | |
| metadatas=metadata27, | |
| ids=ids27 | |
| ) | |
| # In[361]: | |
| append_data(contents27, metadata27, embeddings27) | |
| # In[362]: | |
| df | |
| # # <p style="color: orange;">Document 27 Parcours LMD Physique </p> | |
| # | |
| # In[364]: | |
| loader = WebBaseLoader( | |
| web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=W0NZFFp1UjNcbVshDjAENlJ0X2tTbg==&etab=AWUDMl9t",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("center") | |
| ) | |
| ), | |
| ) | |
| parcours_physique = loader.load() | |
| # In[365]: | |
| parcours_physique = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in parcours_physique] | |
| parcours_physique | |
| # ## splitting doc 27 into chunks | |
| # In[367]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits28 = text_splitter.split_documents(parcours_physique) | |
| # In[368]: | |
| splits28 | |
| # In[369]: | |
| contents28= [doc.page_content for doc in splits28] | |
| metadata28= [doc.metadata for doc in splits28] | |
| # In[370]: | |
| embeddings28 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits28], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings28) | |
| # In[371]: | |
| ids28 = [str(uuid.uuid4()) for _ in range(len(contents28))] | |
| # In[372]: | |
| data.add( | |
| documents=contents28, | |
| embeddings=embeddings28, | |
| metadatas=metadata28, | |
| ids=ids28 | |
| ) | |
| # In[373]: | |
| append_data(contents28, metadata28, embeddings28) | |
| # In[374]: | |
| df | |
| # # <p style="color: orange;">Document 28 Parcours LMD chimie </p> | |
| # | |
| # In[376]: | |
| loader = WebBaseLoader( | |
| web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=W0NYFV9wVDVcbQF7BzkKPQQiCz8HOg==&etab=B2NUZQAy",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("center") | |
| ) | |
| ), | |
| ) | |
| parcours_chimie = loader.load() | |
| # In[377]: | |
| parcours_chimie = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in parcours_chimie] | |
| parcours_chimie | |
| # ## splitting doc 28 into chunks | |
| # In[379]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits29= text_splitter.split_documents(parcours_chimie) | |
| # In[380]: | |
| splits29 | |
| # In[381]: | |
| contents29= [doc.page_content for doc in splits29] | |
| metadata29= [doc.metadata for doc in splits29] | |
| # In[382]: | |
| embeddings29 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits29], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings29) | |
| # In[383]: | |
| ids29 = [str(uuid.uuid4()) for _ in range(len(contents29))] | |
| # In[384]: | |
| data.add( | |
| documents=contents29, | |
| embeddings=embeddings29, | |
| metadatas=metadata29, | |
| ids=ids29 | |
| ) | |
| # In[385]: | |
| append_data(contents29, metadata29, embeddings29) | |
| # In[386]: | |
| df | |
| # # <p style="color: orange;"> Document 29 Parcours LMD Physique-Chimie</p> | |
| # | |
| # In[388]: | |
| loader = WebBaseLoader( | |
| web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=Bh4HSlh3VTQGN1ctVWsAMVJ0DjA=&etab=VjJZaA0/",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("center") | |
| ) | |
| ), | |
| ) | |
| parcours_physique_chimie = loader.load() | |
| # In[389]: | |
| parcours_physique_chimie = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in parcours_physique_chimie] | |
| parcours_physique_chimie | |
| # ## splitting doc 29 into chunks | |
| # In[391]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits30= text_splitter.split_documents(parcours_physique_chimie) | |
| # In[392]: | |
| splits30 | |
| # In[393]: | |
| contents30= [doc.page_content for doc in splits30] | |
| metadata30= [doc.metadata for doc in splits30] | |
| # In[394]: | |
| embeddings30 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits30], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings30) | |
| # In[395]: | |
| ids30 = [str(uuid.uuid4()) for _ in range(len(contents30))] | |
| # In[396]: | |
| data.add( | |
| documents=contents30, | |
| embeddings=embeddings30, | |
| metadatas=metadata30, | |
| ids=ids30 | |
| ) | |
| # In[397]: | |
| append_data(contents30, metadata30, embeddings30) | |
| df | |
| # # <p style="color: orange;">Document 30 Demande de diplômes</p> | |
| # | |
| loader = WebBaseLoader( | |
| web_paths=("https://fsm.rnu.tn/fra/articles/1249/demande-de-diplomes",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("content") | |
| ) | |
| ), | |
| ) | |
| doc_demande_de_diplome = loader.load() | |
| # In[401]: | |
| doc_demande_de_diplome = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in doc_demande_de_diplome] | |
| doc_demande_de_diplome | |
| # ## splitting doc 30 into chunks | |
| # In[403]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits31 = text_splitter.split_documents(doc_demande_de_diplome) | |
| # In[404]: | |
| splits31 | |
| # In[405]: | |
| contents31= [doc.page_content for doc in splits31] | |
| metadata31= [doc.metadata for doc in splits31] | |
| # In[406]: | |
| embeddings31 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits31], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings31) | |
| # In[407]: | |
| ids31 = [str(uuid.uuid4()) for _ in range(len(contents31))] | |
| # In[408]: | |
| data.add( | |
| documents=contents31, | |
| embeddings=embeddings31, | |
| metadatas=metadata31, | |
| ids=ids31 | |
| ) | |
| # In[409]: | |
| append_data(contents31, metadata31, embeddings31) | |
| # In[410]: | |
| df | |
| # # <p style="color: orange;">Document 31 INFORMATION sur master rechereche mathematique </p> | |
| # | |
| # In[412]: | |
| loader = WebBaseLoader( | |
| web_paths=("https://um.rnu.tn/fr/formations/formation-lmd/master/mat%C3%A8re-de-recherche-en-math%C3%A9matiques-fsm/",), | |
| bs_kwargs=dict( | |
| parse_only=bs4.SoupStrainer( | |
| class_=("single-post-content single-content") | |
| ) | |
| ), | |
| ) | |
| info_supp_mastere_math = loader.load() | |
| # In[413]: | |
| info_supp_mastere_math = [ | |
| Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
| for doc in info_supp_mastere_math] | |
| info_supp_mastere_math | |
| # ## splitting doc 31 into chunks | |
| # In[415]: | |
| text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
| splits32 = text_splitter.split_documents(info_supp_mastere_math) | |
| # In[416]: | |
| splits32 | |
| # In[417]: | |
| contents32= [doc.page_content for doc in splits32] | |
| metadata32 = [doc.metadata for doc in splits32] | |
| # In[418]: | |
| embeddings32 = embeddings_model.embed_documents( | |
| [doc.page_content for doc in splits32], | |
| # normalize_embeddings=True, | |
| # batch_size=256, | |
| # show_progress_bar=True | |
| ) | |
| print(embeddings32) | |
| # In[419]: | |
| ids32 = [str(uuid.uuid4()) for _ in range(len(contents32))] | |
| # In[420]: | |
| data.add( | |
| documents=contents32, | |
| embeddings=embeddings32, | |
| metadatas=metadata32, | |
| ids=ids32 | |
| ) | |
| # In[421]: | |
| append_data(contents32, metadata32, embeddings32) | |
| # Note: .get() returns a plain dict, so this rebinds `data` from the collection | |
| # handle to its stored embeddings; the collection is re-fetched below before querying. | |
| data = data.get(include=['embeddings']) | |
| print(data) | |
| # In[427]: | |
| if 'embeddings' in data and data['embeddings'] is not None: | |
|     embeddings_array = np.array(data['embeddings']) | |
|     print("Embeddings shape:", embeddings_array.shape) | |
| else: | |
|     # keep the name defined so the PCA cell below cannot raise a NameError | |
|     embeddings_array = np.array([]) | |
|     print("No embeddings found in vectorstore.") | |
| # In[428]: | |
| if embeddings_array.size > 0: | |
| pca = PCA(n_components=2) | |
| embeddings_2d = pca.fit_transform(embeddings_array) | |
| # Plot embeddings | |
| plt.figure(figsize=(8, 6)) | |
| plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.7) | |
| plt.xlabel("PCA 1") | |
| plt.ylabel("PCA 2") | |
| plt.title("2D Visualization of Embeddings") | |
| plt.show() | |
| else: | |
| print("No embeddings available for PCA visualization.") | |
| # # Manually testing retrieval (2nd attempt), just checking 👌 | |
| # In[430]: | |
| data = chroma_client.get_collection(name="my_dataaaa") | |
| # In[431]: | |
| query_embedding = embeddings_model.embed_query("Quels sont les documents de stage obligatoires ?") | |
| results = data.query( | |
| query_embeddings=[query_embedding], | |
| n_results=50 | |
| ) | |
| # In[432]: | |
| results | |
| # In[783]: | |
| chroma_client = chromadb.PersistentClient(path="chroma_db") | |
| collections = chroma_client.list_collections() | |
| print("Available collections:", collections) | |
| if "my_dataaaa" in collections: | |
| collection = chroma_client.get_collection(name="my_dataaaa") | |
| print(" Successfully loaded collection:", collection) | |
| else: | |
| print("Collection 'my_dataaaa' does not exist.", collections) | |
| from transformers import AutoModelForSequenceClassification, pipeline | |
| embeddings_model = HuggingFaceEmbeddings(model_name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5") | |
| model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")  # not used directly: the pipeline below loads its own copy | |
| classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli") | |
| def detect_intent(text): | |
| result = classifier(text, candidate_labels=["question", "greeting", "small talk", "feedback", "thanks"]) | |
| label = result["labels"][0] | |
| return label.lower() | |
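| # Illustrative sanity check of the zero-shot intent detector; the labels in the | |
| # comments are assumptions about the model's likely output, not guarantees. | |
| print(detect_intent("Bonjour, comment allez-vous ?"))            # likely "greeting" | |
| print(detect_intent("Comment demander un relevé de notes ?"))    # likely "question" | |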
| chroma_db_path = "./chroma_db" | |
| chroma_client = chromadb.PersistentClient(path=chroma_db_path) | |
| data = chroma_client.get_collection(name="my_dataaaa") | |
| vectorstore = Chroma( | |
| collection_name="my_dataaaa", | |
| persist_directory="./chroma_db", | |
| embedding_function=embeddings_model | |
| ) | |
| # Create a retriever from the Chroma datastore (MMR search to balance relevance and diversity) | |
| retriever = vectorstore.as_retriever( | |
| search_type="mmr", | |
| search_kwargs={'k': 6, 'lambda_mult': 0.25} | |
| ) | |
| from sentence_transformers import CrossEncoder | |
| reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2') | |
| def rerank_docs(query, docs, top_k=50): | |
| pairs = [(query, doc.page_content) for doc in docs] | |
| scores = reranker.predict(pairs) | |
| scored_docs = list(zip(docs, scores)) | |
| scored_docs = sorted(scored_docs, key=lambda x: x[1], reverse=True) | |
| top_docs = [doc for doc, score in scored_docs[:top_k]] | |
| return top_docs | |
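| # Minimal standalone sketch of retrieve-then-rerank with a hypothetical query; | |
| # the RAG chain defined below wires the same two steps together. | |
| sample_query = "Quels sont les documents de stage obligatoires ?" | |
| sample_docs = retriever.invoke(sample_query) | |
| top_docs = rerank_docs(sample_query, sample_docs, top_k=5) | |
| print([d.page_content[:80] for d in top_docs]) | |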
| custom_prompt = PromptTemplate.from_template(""" | |
| You are a helpful assistant answering student questions based ONLY on the provided context. | |
| You must read the entire context carefully and include all relevant information in your answer. | |
| If multiple documents or requirements are mentioned, list them all clearly and completely. | |
| If the answer is not found in the context, respond with: "je ne trouve pas la réponse." | |
| Do not use your own knowledge for university-related questions. Only use what is in the context. | |
| Structure the answer clearly and completely. Do not make any assumptions if the context does not have the answer. | |
| Context: | |
| {context} | |
| Question: | |
| {question} | |
| Answer: | |
| """) | |
| llm = ChatOpenAI(model="gpt-3.5-turbo") | |
| def format_docs(docs): | |
|     return "\n\n".join(doc.page_content for doc in docs) | |
| # Leftover notebook cell kept for reference: `docs` is not defined at this point, | |
| # so calling format_docs(docs) here would raise a NameError. | |
| # context = format_docs(docs) | |
| # context | |
| def retrieve_and_rerank(question): | |
|     # Retrieve candidates with MMR, then rerank them with the cross-encoder | |
|     # against the user's actual question before formatting them as context. | |
|     docs = retriever.invoke(question) | |
|     return format_docs(rerank_docs(question, docs)) | |
| rag_chain = ( | |
|     { | |
|         "context": RunnableLambda(retrieve_and_rerank), | |
|         "question": RunnablePassthrough() | |
|     } | |
|     | custom_prompt | |
|     | llm | |
|     | StrOutputParser() | |
| ) | |
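| # Example invocation of the full chain (hypothetical question; the answer depends | |
| # on the indexed documents and the OpenAI model): | |
| print(rag_chain.invoke("Comment faire une demande de diplôme ?")) | |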
| import json | |
| from datetime import datetime | |
| PENDING_QUESTIONS_FILE = "pending_questions.json" | |
| def store_pending_question(user_email, question): | |
|     q_id = str(uuid.uuid4()) | |
|     pending = { | |
|         "id": q_id, | |
|         "timestamp": datetime.utcnow().isoformat(), | |
| "user_email": user_email, | |
| "question": question | |
| } | |
| if os.path.exists(PENDING_QUESTIONS_FILE): | |
| with open(PENDING_QUESTIONS_FILE, "r") as f: | |
| data = json.load(f) | |
| else: | |
| data = [] | |
| data.append(pending) | |
| with open(PENDING_QUESTIONS_FILE, "w") as f: | |
| json.dump(data, f, indent=4) | |
| return q_id | |
| def send_question_to_admin(user_email, user_question,question_id): | |
| admin_email = "[email protected]" | |
| smtp_server = "smtp.gmail.com" | |
| smtp_port = 587 | |
| sender_email = "[email protected]" | |
| sender_password = os.getenv("BOT_EMAIL_PASSWORD") | |
| subject = f"Nouvelle question [{question_id}] " | |
| body = ( | |
| f"Question ID: {question_id}\n" | |
| f"Question posée :\n\n{user_question}" | |
| ) | |
| message = MIMEMultipart() | |
| message["From"] = sender_email | |
| message["To"] = admin_email | |
| message["Reply-To"] = "[email protected]" | |
| message["Subject"] = subject | |
| message.attach(MIMEText(body, "plain")) | |
| try: | |
| with smtplib.SMTP(smtp_server, smtp_port) as server: | |
| server.starttls() | |
| server.login(sender_email, sender_password) | |
| server.sendmail(sender_email, admin_email, message.as_string()) | |
| return True | |
| except Exception as e: | |
| print("Error sending email:", e) | |
| return False | |
| def university_related(question): | |
| labels = ["university", "general knowledge"] | |
| result = classifier(question, candidate_labels=labels) | |
| top_label = result["labels"][0] | |
| return top_label.lower() == "university" | |
| def uncertain(answer): | |
| uncertain_phrases = [ | |
| "je ne trouve pas la réponse", | |
| "désolé, je ne peux pas vous aider" | |
| ] | |
| return any(phrase in answer.lower() for phrase in uncertain_phrases) or answer.strip() == "" | |
| def handle_user_query(question, user_email=None): | |
| # using the classifier model | |
| intent = detect_intent(question.lower()) | |
| if intent in ["greeting", "small talk"]: | |
| return "Salut 👋 ! Posez-moi une question précise sur les procédures universitaires 😊." | |
| if not university_related(question): | |
| return "Merci de poser une question sur les procédures universitaires 😊" | |
| # run the RAG pipeline | |
| answer = rag_chain.invoke(question) | |
| # fall back to forwarding the question to an administrator when the model cannot find an answer in the context | |
| if uncertain(answer): | |
| if not user_email: | |
| return ( | |
| "Je ne trouve pas la réponse à cette question. " | |
| "Veuillez me fournir votre adresse e-mail et la question en français pour que je puisse la transmettre à un administrateur.") | |
| q_id = store_pending_question(user_email, question) | |
| sent = send_question_to_admin(user_email, question, q_id) | |
| if sent: | |
| return "Votre question a été transmise à l'administration. Vous recevrez une réponse par e-mail dès que possible." | |
| else: | |
| return "Une erreur est survenue lors de l'envoi de votre question. Veuillez réessayer plus tard." | |
| else: | |
| return answer | |
| user_email = "" | |
| def chatbot_fn(message, history): | |
| global user_email | |
| if not user_email: | |
| if "@gmail.com" in message or "@fsm.rnu.tn" in message: | |
| user_email = message | |
| return "Merci ! Maintenant, posez-moi votre question 😊" | |
| else: | |
| return "Bienvenue 👋 Veuillez entrer votre adresse e-mail pour commencer." | |
| return handle_user_query(message, user_email) | |
| with gr.Blocks() as chat: | |
| gr.ChatInterface( | |
| fn=chatbot_fn, | |
| title="Chatbot Universitaire 🤖 🧠", | |
| description="Commencez par entrer votre adresse e-mail. Ensuite, posez toutes vos questions sur les procédures universitaires !", | |
| examples=[ | |
| ["Comment faire une demande de réinscription ?"], | |
| ["Quels sont les délais pour la soutenance ?"] | |
| ], | |
| submit_btn="Envoyer" | |
| ) | |
| gr.Markdown("© 2025 Esra Belhassen. All rights reserved") | |
| chat.launch(share=True) | |