# ESG Report Classification — Streamlit app (recovered from a mangled page dump).
# Third-party dependencies (grouped alphabetically, PEP 8 style).
import pandas as pd
import requests
import spacy
import streamlit as st
from tika import parser
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
| # Loading spaCy model outside the streamlit cache | |
| nlp = spacy.load("en_core_web_sm") | |
| def load_environmental_model(): | |
| name_env = "ESGBERT/EnvironmentalBERT-environmental" | |
| tokenizer_env = AutoTokenizer.from_pretrained(name_env) | |
| model_env = AutoModelForSequenceClassification.from_pretrained(name_env) | |
| return pipeline("text-classification", model=model_env, tokenizer=tokenizer_env) | |
| def load_social_model(): | |
| name_soc = "ESGBERT/SocialBERT-social" | |
| tokenizer_soc = AutoTokenizer.from_pretrained(name_soc) | |
| model_soc = AutoModelForSequenceClassification.from_pretrained(name_soc) | |
| return pipeline("text-classification", model=model_soc, tokenizer=tokenizer_soc) | |
| def load_governance_model(): | |
| name_gov = "ESGBERT/GovernanceBERT-governance" | |
| tokenizer_gov = AutoTokenizer.from_pretrained(name_gov) | |
| model_gov = AutoModelForSequenceClassification.from_pretrained(name_gov) | |
| return pipeline("text-classification", model=model_gov, tokenizer=tokenizer_gov) | |
| def load_sentiment_model(): | |
| model_name = "climatebert/distilroberta-base-climate-sentiment" | |
| model = AutoModelForSequenceClassification.from_pretrained(model_name) | |
| tokenizer = AutoTokenizer.from_pretrained(model_name, max_len=512) | |
| return pipeline("text-classification", model=model, tokenizer=tokenizer) | |
| # Streamlit App | |
| st.title("ESG Report Classification using Natural Language Processing") | |
| # Get report URL from user input | |
| url = st.text_input("Enter the URL of the report (PDF):") | |
| # Model selection dropdown | |
| st.write("Environmental Model, Social Model, Governance Model would give the percentage denoting the parameter chosen.") | |
| st.write("Sentiment Model shows if the company is a risk or opportunity based on all 3 parameters.") | |
| selected_model = st.selectbox("Select Model", ["Environmental Model", "Social Model", "Governance Model", "Sentiment Model"]) | |
| if url: | |
| # Download PDF content from the URL | |
| response = requests.get(url, stream=True) | |
| if response.status_code == 200: | |
| # Parse PDF and extract text | |
| raw_text = parser.from_buffer(response.content)['content'] | |
| # Extract sentences using spaCy | |
| doc = nlp(raw_text) | |
| sentences = [sent.text for sent in doc.sents] | |
| # Filtering and preprocessing sentences | |
| sequences = list(map(str, sentences)) | |
| sentences = [x.replace("\n", "") for x in sequences] | |
| sentences = [x for x in sentences if x != ""] | |
| sentences = [x for x in sentences if x[0].isupper()] | |
| sub_sentences = sentences[:100] | |
| # Classification using different models based on user selection | |
| if selected_model == "Environmental Model": | |
| pipe_model = load_environmental_model() | |
| elif selected_model == "Social Model": | |
| pipe_model = load_social_model() | |
| elif selected_model == "Governance Model": | |
| pipe_model = load_governance_model() | |
| else: | |
| pipe_model = load_sentiment_model() | |
| # Get predictions for the selected model | |
| model_results = pipe_model(sub_sentences, padding=True, truncation=True) | |
| model_labels = [x["label"] for x in model_results] | |
| # Display count of sentences labeled as the selected model | |
| st.subheader(f"{selected_model} Sentences Count") | |
| st.write(pd.DataFrame({"sentence": sub_sentences, selected_model: model_labels}).groupby(selected_model).count()) | |
| else: | |
| st.error("Error fetching PDF content from the provided URL. Please check the URL and try again.") | |