vicky4s4s committed (verified)
Commit 51b2aac · 1 Parent(s): 7ad1177

Upload 3 files

Files changed (3)
  1. agents.py +85 -0
  2. app.py +142 -0
  3. utils.py +78 -0
agents.py ADDED
@@ -0,0 +1,85 @@
+ from api import Model
+ import os
+ 
+ 
+ def initial_prompt(list_of_articles, POSITIVE, NEGATIVE, cleaned_text):
+     return f"""
+ You are a senior data analyst with 15+ years of expertise in comparative analysis. Your task is to analyze multiple articles from the given list and generate a structured **JSON summary** that highlights key differences, patterns, and sentiment insights.
+ 
+ ### **Input Data:**
+ - **Articles:** {list_of_articles}
+ 
+ ### Complete contents of the articles:
+ {cleaned_text}
+ 
+ ### **Task Requirements:**
+ 1. **Identify Key Differences:**
+    - Highlight the **strongest contrasting viewpoints** or **highest topic overlap** among the articles.
+ 
+ 2. **Generate a Structured JSON Output:**
+    - **Coverage Differences:** Identify variations in focus across articles.
+    - **Impact Analysis:** Explain how these differences affect audience perception.
+ 
+ 3. **Perform Sentiment Analysis:**
+    - Generate a summary based on the sentiment distribution.
+ 
+ ### **Sentiment Overview:**
+ - **Positive Mentions:** {POSITIVE}
+ - **Negative Mentions:** {NEGATIVE}
+ """
+ 
+ 
+ # The output-format instructions live in a plain (non f-string) block so the
+ # literal braces of the JSON template are not treated as placeholders.
+ prompt1 = """
+ ### **Expected JSON Output Format:**
+ {
+     "Coverage Differences": [
+         {
+             "Comparison": "{Article_X} focuses on {Key_Topic_X}, while {Article_Y} highlights {Key_Topic_Y}.",
+             "Impact": "{Effect_of_the_different_focuses_on_audience_perception}."
+         },
+         {
+             "Comparison": "{Article_X} emphasizes {Aspect_X}, whereas {Article_Y} discusses {Aspect_Y}.",
+             "Impact": "{Potential_market_or_public_reaction}."
+         },
+         {
+             "Comparison": "{Article_X} presents {Perspective_X}, but {Article_Y} contrasts this with {Perspective_Y}.",
+             "Impact": "{Implications_of_the_conflicting_perspectives}."
+         }
+     ],
+     "Topic Overlap": {
+         "Common Topics": ["{Common_Topic_1}", "{Common_Topic_2}"],
+         "Unique Topics in {Article_X}": ["{Unique_Topic_X_1}", "{Unique_Topic_X_2}"],
+         "Unique Topics in {Article_Y}": ["{Unique_Topic_Y_1}", "{Unique_Topic_Y_2}"]
+     },
+     "Final Sentiment Analysis": "{Overall_sentiment_summary (Positive/Negative/Neutral) with a brief explanation}",
+     "Overall_Sentiment_Summarizing_Report": "{A clean, well-formatted summary of the complete contents of the articles}"
+ }
+ 
+ ### **Instructions:**
+ - **Strictly** follow this JSON structure in your output.
+ - Do **not** include any additional text besides the JSON.
+ """
+ 
+ 
+ def generate_response(list_of_articles, POSITIVE, NEGATIVE, clean_text):
+     # Build the full prompt and query the LLM wrapper exposed by api.Model.
+     prompt = initial_prompt(list_of_articles, POSITIVE, NEGATIVE, clean_text) + "\n\n" + prompt1
+     answer_text = Model.OPENAI_MODEL(prompt)
+ 
+     # Locate the first JSON array or object in the reply and cut it out,
+     # discarding any surrounding text the model may have produced.
+     start_index_square = answer_text.find('[')
+     start_index_curly = answer_text.find('{')
+     if start_index_square != -1 and (start_index_curly == -1 or start_index_square < start_index_curly):
+         start_index = start_index_square
+         end_char = ']'
+     elif start_index_curly != -1 and (start_index_square == -1 or start_index_curly < start_index_square):
+         start_index = start_index_curly
+         end_char = '}'
+     else:
+         return "Error: JSON data not found."
+ 
+     extracted_json = None
+     end_index = answer_text.rfind(end_char)
+     if start_index != -1 and end_index != -1:
+         extracted_json = answer_text[start_index:end_index + 1]
+     return extracted_json
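agents.py imports `Model` from an `api` module that is not included in this commit; the only interface the code relies on is `Model.OPENAI_MODEL(prompt)` returning the model's reply as a string. A minimal hypothetical sketch of such a module (the class and method names come from the import above; the OpenAI client usage, environment variable, and model name are assumptions):

```python
# api.py -- hypothetical sketch, not part of this commit.
# agents.py only needs Model.OPENAI_MODEL(prompt) -> str.
import os
from openai import OpenAI  # assumes the openai>=1.x client library

_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

class Model:
    @staticmethod
    def OPENAI_MODEL(prompt: str) -> str:
        # Model name is a placeholder; use whichever model the Space is configured for.
        response = _client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content
```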
app.py ADDED
@@ -0,0 +1,142 @@
+ import json
+ import spacy
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ from transformers import pipeline, set_seed
+ from utils import extract_data
+ from agents import generate_response
+ from gtts import gTTS
+ import gradio as gr
+ import os
+ import asyncio
+ from googletrans import Translator
+ 
+ nlp = spacy.load("en_core_web_sm")
+ set_seed(42)
+ 
+ 
+ def eng_to_hindi(text):
+     # Newer googletrans releases expose an async translate(); run it on a fresh event loop.
+     translator = Translator()
+     loop = asyncio.new_event_loop()
+     asyncio.set_event_loop(loop)
+     translated_text = loop.run_until_complete(translator.translate(text, src="en", dest="hi"))
+     return translated_text.text
+ 
+ 
+ def text_to_voice(text, complete_text):
+     output_audio = r"assets/output.mp3"
+     output_text = r"assets/output.txt"
+     os.makedirs("assets", exist_ok=True)  # make sure the output directory exists
+     hindi_text = eng_to_hindi(text)
+     tts = gTTS(text=hindi_text, lang="hi")
+     tts.save(output_audio)
+     with open(output_text, "w", encoding="utf-8") as f:
+         f.write(complete_text)
+     return output_audio, output_text
+ 
+ 
+ def compare_articles(articles):
+     docs = [nlp(article) for article in articles]
+     entities_list = [set(doc.ents) for doc in docs]  # named entities per article (not used downstream yet)
+     keywords_list = [set(chunk.text.lower() for chunk in doc.noun_chunks) for doc in docs]
+     topic_overlap = {
+         f"Article {i + 1} & Article {j + 1}": {
+             "Common Topics": list(keywords_list[i] & keywords_list[j]),
+             f"Unique to Article {i + 1}": list(keywords_list[i] - keywords_list[j]),
+             f"Unique to Article {j + 1}": list(keywords_list[j] - keywords_list[i])
+         }
+         for i in range(len(articles))
+         for j in range(i + 1, len(articles))
+     }
+ 
+     # Pairwise TF-IDF cosine similarity between article summaries.
+     vectorizer = TfidfVectorizer().fit_transform(articles)
+     similarity_matrix = cosine_similarity(vectorizer)
+ 
+     similarity_scores = {
+         f"Article {i + 1} & Article {j + 1}": similarity_matrix[i][j]
+         for i in range(len(articles))
+         for j in range(i + 1, len(articles))
+     }
+ 
+     output = {
+         "Topic Overlap": topic_overlap,
+         "Similarity Scores": similarity_scores
+     }
+ 
+     return output
+ 
+ 
+ def sentiment_analysis(input_text):
+     model_id = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
+     sentiment_pipeline = pipeline(
+         "sentiment-analysis",
+         model=model_id,
+         tokenizer=model_id,
+     )
+     data = extract_data(input_text)
+     sentiment_counts = {"POSITIVE": 0, "NEGATIVE": 0, "NEUTRAL": 0}
+     summary_list = []
+     all_articles = []
+     for sublist in data:
+         for item in sublist:
+             summary_text = item['summary']
+             summary_list.append(summary_text)
+             results = sentiment_pipeline(summary_text)
+             sentiment_label = results[0]['label'].upper()
+             sentiment_counts[sentiment_label] += 1
+             all_articles.append({
+                 "Title": item['title'],
+                 "Summary": summary_text,
+                 "Sentiment": sentiment_label,
+                 "Topics": item['topics']
+             })
+ 
+     # Local spaCy/TF-IDF comparison (computed but not included in the final output).
+     comparison_result1 = compare_articles(summary_list)
+ 
+     clean_text = ""
+     for item in summary_list:
+         clean_text += item + " \n"
+     response = generate_response(summary_list,
+                                  sentiment_counts["POSITIVE"],
+                                  sentiment_counts["NEGATIVE"],
+                                  clean_text)
+     response_dict = json.loads(response)
+     coverage_differences = response_dict.get("Coverage Differences", [])
+     Topic_Overlap = response_dict.get("Topic Overlap", [])
+     Final_Sentiment_Analysis = response_dict.get("Final Sentiment Analysis", [])
+     summarizing_report = response_dict.get("Overall_Sentiment_Summarizing_Report", [])
+     final_output = {
+         "Company": input_text,
+         "Articles": all_articles,
+         "Comparative Sentiment Score": {
+             "Sentiment Distribution": {
+                 "Positive": sentiment_counts["POSITIVE"],
+                 "Negative": sentiment_counts["NEGATIVE"],
+             }
+         },
+         "Coverage Differences": coverage_differences,
+         "Topic Overlap": Topic_Overlap,
+         "Final Sentiment Analysis": Final_Sentiment_Analysis,
+         "Overall sentiment summarizing report": summarizing_report
+     }
+ 
+     return final_output
+ 
+ 
+ def main(input_text):
+     final_answer = sentiment_analysis(input_text)
+     clean_text = json.dumps(final_answer, indent=4)
+     output_audio, output_text = text_to_voice(final_answer["Overall sentiment summarizing report"], clean_text)
+     return output_audio, output_text
+ 
+ 
+ interface = gr.Interface(
+     fn=main,
+     inputs=gr.Textbox(label="Enter the input"),
+     outputs=[
+         gr.Audio(label="Hindi Audio Output"),
+         gr.File(label="Complete summarization report")
+     ],
+     title="News Summarizer",
+     description="Enter text in English, and get a pure Hindi speech output along with a downloadable text file."
+ )
+ 
+ interface.launch()
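For reference, the dictionary returned by sentiment_analysis() (and serialized by main() into the downloadable report) has roughly the following shape; the values below are illustrative placeholders, not real output:

```python
# Illustrative shape of sentiment_analysis()'s return value (placeholder values only).
example_output = {
    "Company": "Tesla",
    "Articles": [
        {"Title": "...", "Summary": "...", "Sentiment": "POSITIVE", "Topics": ["..."]},
    ],
    "Comparative Sentiment Score": {
        "Sentiment Distribution": {"Positive": 7, "Negative": 3},
    },
    "Coverage Differences": [{"Comparison": "...", "Impact": "..."}],
    "Topic Overlap": {"Common Topics": ["..."]},
    "Final Sentiment Analysis": "...",
    "Overall sentiment summarizing report": "...",
}
```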
utils.py ADDED
@@ -0,0 +1,78 @@
+ from newspaper import Article
+ from bs4 import BeautifulSoup
+ import nltk
+ import requests
+ 
+ nltk.download('punkt')
+ 
+ 
+ def generate_related_urls1(title):
+     """Alternative URL source: DuckDuckGo text search for the given title."""
+     from duckduckgo_search import DDGS
+     num_results = 11
+     with DDGS() as ddgs:
+         results = ddgs.text(title, max_results=num_results)
+     return [result["href"] for result in results]
+ 
+ 
+ def generate_related_urls(title):
+     """
+     Collect article URLs for a query via Google search.
+ 
+     :param title: str -- search query
+     :return: list of https URLs (Google's own search pages filtered out)
+     """
+     from googlesearch import search
+     urls_list = []
+     num_results = 11
+     for url in search(title, num_results=num_results):
+         if url.startswith("https") and "google.com/search" not in url:
+             urls_list.append(url)
+     return urls_list
+ 
+ 
+ def extract_data(title):
+     """
+     Download and parse related articles for a query.
+ 
+     :param title: str -- search query
+     :return: list of single-element lists, each containing one article dict
+     """
+     urls_list = generate_related_urls(title)
+     articles_data = []
+     for url in urls_list[:11]:
+         print(f"Processing URL: {url}")
+         try:
+             response = requests.get(url, timeout=10)
+             if response.status_code == 200:
+                 print(f"Success: {url}\n")
+                 # Reuse the response we already have instead of fetching the page twice.
+                 html = response.text
+                 soup = BeautifulSoup(html, "html.parser")
+                 heading_tags = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
+                 topics_list = [heading.text.strip() for heading in heading_tags]
+ 
+                 article = Article(url, language="en")
+                 article.download()
+                 article.parse()
+                 article.nlp()
+                 # Each article dict is wrapped in a single-element list; app.py
+                 # iterates the nested structure accordingly.
+                 article_data = [{
+                     "url": url,
+                     "title": article.title,
+                     "text": article.text,
+                     "authors": article.authors,
+                     "published_date": str(article.publish_date) if article.publish_date else "Unknown",
+                     "top_image": article.top_image,
+                     "videos": article.movies,
+                     "keywords": article.keywords,
+                     "summary": article.summary,
+                     "topics": topics_list
+                 }]
+                 articles_data.append(article_data)
+ 
+             elif response.status_code == 404:
+                 print(f"Error: 404 Not Found - {url}\n")
+             elif response.status_code == 403:
+                 print(f"Error: 403 Forbidden - {url}. Access Denied.\n")
+         except Exception as e:
+             print(f"Failed to process {url}: {str(e)}\n")
+         finally:
+             print("=" * 50 + "\n")
+ 
+     return articles_data
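Because extract_data wraps every article dict in a single-element list, callers outside app.py may want to flatten the result first. A small usage sketch (the query string is only an example, and the call requires network access plus the search and parsing dependencies above):

```python
# Quick shape check / flattening sketch (example query; requires network access).
from utils import extract_data

data = extract_data("Tesla")                        # list of single-element lists of dicts
flat = [article for sublist in data for article in sublist]
for article in flat:
    print(article["title"], "-", article["url"])
```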