Upload 3 files
agents.py
ADDED
@@ -0,0 +1,85 @@
from api import Model
import os


def initial_prompt(list_of_articles, POSITIVE, NEGATIVE, cleaned_text):
    return f"""
You are a senior data analyst with 15+ years of expertise in comparative analysis. Your task is to analyze multiple articles from the given list and generate a structured **JSON summary** that highlights key differences, patterns, and sentiment insights.

### **Input Data:**
- **Articles:** {list_of_articles}


### Complete contents of the articles:
{cleaned_text}

### **Task Requirements:**
1. **Identify Key Differences:**
   - Highlight the **strongest contrasting viewpoints** or **highest topic overlap** among the articles.

2. **Generate a Structured JSON Output:**
   - **Coverage Differences:** Identify variations in focus across articles.
   - **Impact Analysis:** Explain how these differences affect audience perception.

3. **Perform Sentiment Analysis:**
   - Generate a summary based on the sentiment distribution.


### **Sentiment Overview:**
- **Positive Mentions:** {POSITIVE}
- **Negative Mentions:** {NEGATIVE}

"""


prompt1 = """
### **Expected JSON Output Format:**
{
    "Coverage Differences": [
        {
            "Comparison": "{Article_X} focuses on {Key_Topic_X}, while {Article_Y} highlights {Key_Topic_Y}.",
            "Impact": "{Effect_of_the_different_focuses_on_audience_perception}."
        },
        {
            "Comparison": "{Article_X} emphasizes {Aspect_X}, whereas {Article_Y} discusses {Aspect_Y}.",
            "Impact": "{Potential_market_or_public_reaction}."
        },
        {
            "Comparison": "{Article_X} presents {Perspective_X}, but {Article_Y} contrasts this with {Perspective_Y}.",
            "Impact": "{Implications_of_the_conflicting_perspectives}."
        }
    ],
    "Topic Overlap": {
        "Common Topics": ["{Common_Topic_1}", "{Common_Topic_2}"],
        "Unique Topics in {Article_X}": ["{Unique_Topic_X_1}", "{Unique_Topic_X_2}"],
        "Unique Topics in {Article_Y}": ["{Unique_Topic_Y_1}", "{Unique_Topic_Y_2}"]
    },
    "Final Sentiment Analysis": "{Overall_sentiment_summary (Positive/Negative/Neutral) with a brief explanation}",
    "Overall_Sentiment_Summarizing_Report": "A clean, well-structured summary of the complete contents of the articles."
}

### **Instructions:**
- **Strictly** follow this JSON structure in your output.
- Do **not** include any additional text besides the JSON.
"""


def generate_response(list_of_articles, POSITIVE, NEGATIVE, clean_text):
    # Build the full prompt (analysis instructions + expected JSON schema) and query the model.
    prompt = initial_prompt(list_of_articles, POSITIVE, NEGATIVE, clean_text) + "\n\n" + prompt1
    answer_text = Model.OPENAI_MODEL(prompt)

    # Find whichever JSON delimiter appears first: an array or an object.
    start_index_square = answer_text.find('[')
    start_index_curly = answer_text.find('{')
    if start_index_square != -1 and (start_index_curly == -1 or start_index_square < start_index_curly):
        start_index = start_index_square
        end_char = ']'
    elif start_index_curly != -1 and (start_index_square == -1 or start_index_curly < start_index_square):
        start_index = start_index_curly
        end_char = '}'
    else:
        return "Error: JSON data not found."

    # Slice from the first opening bracket to the last closing bracket of the same kind.
    extracted_json = None
    end_index = answer_text.rfind(end_char)
    if start_index != -1 and end_index != -1:
        extracted_json = answer_text[start_index:end_index + 1]
    return extracted_json
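
The prompt pair above asks the model for JSON only, and generate_response then slices from the first opening bracket to the last closing bracket instead of trusting the raw response. A minimal smoke test of that round trip might look like the sketch below; it assumes the separate api module providing Model.OPENAI_MODEL is present and configured with a valid key, and the two summaries are made-up placeholders.

# Hypothetical smoke test for agents.generate_response (example inputs only).
import json
from agents import generate_response

summaries = [
    "Company X beats earnings expectations on strong cloud growth.",
    "Regulators open an inquiry into Company X's data practices.",
]
raw = generate_response(summaries, POSITIVE=1, NEGATIVE=1, clean_text="\n".join(summaries))
report = json.loads(raw)  # raises if the model did not return extractable JSON
print(report["Coverage Differences"])
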
app.py
ADDED
@@ -0,0 +1,142 @@
import json
import os
import asyncio

import spacy
import gradio as gr
from gtts import gTTS
from googletrans import Translator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline, set_seed

from utils import extract_data
from agents import generate_response

nlp = spacy.load("en_core_web_sm")
set_seed(42)


def eng_to_hindi(text):
    # Run the (async) googletrans translate() call on a dedicated event loop.
    translator = Translator()
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    translated_text = loop.run_until_complete(translator.translate(text, src="en", dest="hi"))
    return translated_text.text


def text_to_voice(text, complete_text):
    # Translate the summary to Hindi, synthesize speech with gTTS, and save the full report as text.
    output_audio = r"assets/output.mp3"
    output_text = r"assets/output.txt"
    hindi_text = eng_to_hindi(text)
    tts = gTTS(text=hindi_text, lang="hi")
    tts.save(output_audio)
    with open(output_text, "w", encoding="utf-8") as f:
        f.write(complete_text)
    return output_audio, output_text


def compare_articles(articles):
    # Pairwise comparison: shared/unique noun-chunk topics plus TF-IDF cosine similarity.
    docs = [nlp(article) for article in articles]
    entities_list = [set(doc.ents) for doc in docs]  # named entities per article (not used further yet)
    keywords_list = [set(chunk.text.lower() for chunk in doc.noun_chunks) for doc in docs]
    topic_overlap = {
        f"Article {i + 1} & Article {j + 1}": {
            "Common Topics": list(keywords_list[i] & keywords_list[j]),
            f"Unique to Article {i + 1}": list(keywords_list[i] - keywords_list[j]),
            f"Unique to Article {j + 1}": list(keywords_list[j] - keywords_list[i])
        }
        for i in range(len(articles))
        for j in range(i + 1, len(articles))
    }

    vectorizer = TfidfVectorizer().fit_transform(articles)
    similarity_matrix = cosine_similarity(vectorizer)

    similarity_scores = {
        f"Article {i + 1} & Article {j + 1}": similarity_matrix[i][j]
        for i in range(len(articles))
        for j in range(i + 1, len(articles))
    }

    output = {
        "Topic Overlap": topic_overlap,
        "Similarity Scores": similarity_scores
    }

    return output


def sentiment_analysis(input_text):
    # DistilBERT fine-tuned on SST-2; it labels each article summary POSITIVE or NEGATIVE.
    model_id = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model=model_id,
        tokenizer=model_id,
    )
    data = extract_data(input_text)
    sentiment_counts = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
    summary_list = []
    all_articles = []
    for sublist in data:
        for item in sublist:
            summary_text = item['summary']
            summary_list.append(summary_text)
            results = sentiment_pipeline(summary_text)
            sentiment_label = results[0]['label'].upper()
            sentiment_counts[sentiment_label] += 1
            all_articles.append({
                "Title": item['title'],
                "Summary": summary_text,
                "Sentiment": sentiment_label,
                "Topics": item['topics']
            })

    comparison_result1 = compare_articles(summary_list)  # local spaCy/TF-IDF comparison (not included in the final output)
    clean_text = ""
    for item in summary_list:
        clean_text += item + " \n"
    response = generate_response(summary_list,
                                 sentiment_counts["POSITIVE"],
                                 sentiment_counts["NEGATIVE"],
                                 clean_text)
    response_dict = json.loads(response)
    coverage_differences = response_dict.get("Coverage Differences", [])
    Topic_Overlap = response_dict.get("Topic Overlap", {})
    Final_Sentiment_Analysis = response_dict.get("Final Sentiment Analysis", "")
    summarizing_report = response_dict.get("Overall_Sentiment_Summarizing_Report", "")
    final_output = {
        "Company": input_text,
        "Articles": all_articles,
        "Comparative Sentiment Score": {
            "Sentiment Distribution": {
                "Positive": sentiment_counts["POSITIVE"],
                "Negative": sentiment_counts["NEGATIVE"]
            }
        },
        "Coverage Differences": coverage_differences,
        "Topic Overlap": Topic_Overlap,
        "Final Sentiment Analysis": Final_Sentiment_Analysis,
        "Overall sentiment summarizing report": summarizing_report
    }

    return final_output


def main(input_text):
    final_answer = sentiment_analysis(input_text)
    clean_text = json.dumps(final_answer, indent=4)
    output_audio, output_text = text_to_voice(final_answer["Overall sentiment summarizing report"], clean_text)
    return output_audio, output_text


interface = gr.Interface(
    fn=main,
    inputs=gr.Textbox(label="Enter the input"),
    outputs=[
        gr.Audio(label="Hindi Audio Output"),
        gr.File(label="Complete summarization report")
    ],
    title="News Summarizer",
    description="Enter text in English, and get a pure Hindi speech output along with a downloadable text file."
)

interface.launch()
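
The SST-2 DistilBERT checkpoint used in sentiment_analysis only ever emits POSITIVE or NEGATIVE labels, so the NEUTRAL counter stays at zero and only the first two counts reach the report. A quick standalone check of the pipeline's output shape (a sketch; it downloads the model weights on first run):

# Inspect the output format that sentiment_analysis relies on.
from transformers import pipeline

clf = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)
print(clf("Shares rallied after the strong earnings report."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99...}]
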
utils.py
ADDED
@@ -0,0 +1,78 @@
from newspaper import Article
from bs4 import BeautifulSoup
import nltk
import requests

nltk.download('punkt')


def generate_related_urls1(title):
    # Alternative search backend: DuckDuckGo text search.
    from duckduckgo_search import DDGS
    num_results = 11
    with DDGS() as ddgs:
        results = ddgs.text(title, max_results=num_results)
        return [result["href"] for result in results]


def generate_related_urls(title):
    """
    Search Google for articles related to the given title.

    :param title: str
    :return: list of HTTPS result URLs (Google search pages filtered out)
    """
    from googlesearch import search
    urls_list = []
    num_results = 11
    for url in search(title, num_results=num_results):
        if url.startswith("https") and "google.com/search" not in url:
            urls_list.append(url)
    return urls_list


def extract_data(title):
    """
    Download and parse the related articles for the given title.

    :param title: str
    :return: list of single-element lists, each holding one article's metadata dict
    """
    urls_list = generate_related_urls(title)
    articles_data = []
    for url in urls_list[:11]:
        print(f"Processing URL: {url}")
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                print(f"Success: {url}\n")
                html = response.text
                soup = BeautifulSoup(html, "html.parser")
                # Collect all heading text as the article's topic list.
                heading_tags = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
                topics_list = [tag.text.strip() for tag in heading_tags]
                # Let newspaper download, parse, and summarize the article itself.
                article = Article(url, language="en")
                article.download()
                article.parse()
                article.nlp()
                article_data = [{
                    "url": url,
                    "title": article.title,
                    "text": article.text,
                    "authors": article.authors,
                    "published_date": str(article.publish_date) if article.publish_date else "Unknown",
                    "top_image": article.top_image,
                    "videos": article.movies,
                    "keywords": article.keywords,
                    "summary": article.summary,
                    "topics": topics_list
                }]
                articles_data.append(article_data)

            elif response.status_code == 404:
                print(f"Error: 404 Not Found - {url}\n")
            elif response.status_code == 403:
                print(f"Error: 403 Forbidden - {url}. Access Denied.\n")
        except Exception as e:
            print(f"Failed to process {url}: {str(e)}\n")
        finally:
            print("=" * 50 + "\n")

    return articles_data
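
Note that extract_data returns a list of single-element lists, one per successfully scraped URL, which is why sentiment_analysis in app.py walks it with a nested loop. A small sketch of inspecting that structure directly, assuming network access and that newspaper3k, googlesearch, and the NLTK punkt data are installed ("Tesla" is just an example query):

# Hypothetical direct use of utils.extract_data to inspect its nested output.
from utils import extract_data

data = extract_data("Tesla")
for sublist in data:
    for item in sublist:
        print(item["title"], "|", len(item["summary"]), "chars in summary,", len(item["topics"]), "headings")
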