Inara132000 committed on
Commit
f98da18
·
verified ·
1 Parent(s): 4666162

Upload deliverable2.py

Browse files
Files changed (1) hide show
  1. deliverable2.py +186 -0
deliverable2.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Untitled28.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/14xRie8cU3OXbtj4jX0HaEyZUDmy6cgPD
8
+ """
9
+
10
+ import requests
11
+ from bs4 import BeautifulSoup
12
+ from sentence_transformers import SentenceTransformer, util
13
+ from transformers import pipeline
14
+
15
class URLValidator:
    """
    Rate the credibility of a webpage with respect to a user query.

    Five sub-scores are computed (domain trust, content relevance,
    fact-check, bias, citations) and combined into a weighted final score,
    a 1-5 star rating and a short textual explanation.
    """

    # Timeout (seconds) applied to every outbound HTTP request so a slow
    # host cannot hang the whole rating.
    REQUEST_TIMEOUT = 10

    # Sentiment label -> bias score. Both the human-readable labels and the
    # generic LABEL_n ids are accepted: per its model card, the
    # cardiffnlp/twitter-roberta-base-sentiment checkpoint reports
    # LABEL_0 (negative), LABEL_1 (neutral), LABEL_2 (positive).
    # Any label not listed here scores 30.
    _SENTIMENT_SCORES = {
        "POSITIVE": 100,
        "LABEL_2": 100,
        "NEUTRAL": 50,
        "LABEL_1": 50,
    }

    def __init__(self, serpapi_key=None):
        """
        Load the NLP models once and resolve the SerpAPI key.

        Args:
            serpapi_key: SerpAPI key. When omitted, a module-level
                ``SERPAPI_API_KEY`` is used if one exists, otherwise the
                ``SERPAPI_API_KEY`` environment variable. May end up
                ``None``, in which case the citation check returns 0.
        """
        import os  # local import: only needed for the environment fallback

        if serpapi_key is None:
            # Looking the global up dynamically avoids a NameError when the
            # notebook cell that used to define it is not present.
            serpapi_key = globals().get("SERPAPI_API_KEY") or os.environ.get("SERPAPI_API_KEY")
        self.serpapi_key = serpapi_key

        # Load models once so they are not re-initialised on every call.
        self.similarity_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        self.fake_news_classifier = pipeline("text-classification", model="mrm8488/bert-tiny-finetuned-fake-news-detection")
        self.sentiment_analyzer = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")

    def fetch_page_content(self, url: str) -> str:
        """
        Download ``url`` and return the text of its ``<p>`` elements.

        Returns:
            The paragraph texts joined by single spaces, or ``""`` when the
            request fails or returns an HTTP error status.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException:
            return ""  # Fail gracefully: callers treat "" as "no content"
        soup = BeautifulSoup(response.text, "html.parser")
        return " ".join(p.text for p in soup.find_all("p"))

    def get_domain_trust(self, url: str, content: str) -> int:
        """
        Return the average of the available trust scores, or 50 if none.

        ``url`` is accepted for interface compatibility but is not used;
        the only data source currently consulted is the fake-news model.
        """
        trust_scores = []

        if content:
            try:
                trust_scores.append(self.get_domain_trust_huggingface(content))
            except Exception:
                # Best effort: a failing classifier must not abort the
                # rating, so fall through to the neutral default below.
                pass

        if not trust_scores:
            return 50
        return int(sum(trust_scores) / len(trust_scores))

    def get_domain_trust_huggingface(self, content: str) -> int:
        """
        Score ``content`` with the fake-news classifier.

        Returns:
            100 for a "REAL" label, 30 for "FAKE", 50 for anything else or
            for empty content.
        """
        if not content:
            return 50  # Default score if no content available
        # Only the first 512 characters are classified.
        result = self.fake_news_classifier(content[:512])[0]
        label = str(result["label"]).upper()
        # NOTE(review): if this checkpoint emits LABEL_0/LABEL_1 instead of
        # REAL/FAKE, every page gets 50 here -- confirm against the model
        # card and add the mapping.
        if label == "REAL":
            return 100
        if label == "FAKE":
            return 30
        return 50

    def compute_similarity_score(self, user_query: str, content: str) -> int:
        """
        Return the query/content semantic similarity as an int in 0-100.

        Cosine similarity can be negative; it is clamped so the score stays
        on the same 0-100 scale as the other sub-scores.
        """
        if not content:
            return 0
        query_vec = self.similarity_model.encode(user_query)
        page_vec = self.similarity_model.encode(content)
        cosine = util.pytorch_cos_sim(query_vec, page_vec).item()
        return max(0, min(100, int(cosine * 100)))

    def check_facts(self, content: str) -> int:
        """
        Look up the first 200 characters of ``content`` as a claim query.

        Returns:
            80 if the response contains a non-empty "claims" entry, 40 if
            it does not, and 50 when there is no content, the request
            fails, or the response is not a JSON object.
        """
        if not content:
            return 50
        # NOTE(review): this is the Fact Check Explorer URL; the official
        # API appears to be factchecktools.googleapis.com claims:search and
        # needs a key -- confirm before relying on this score.
        api_url = "https://toolbox.google.com/factcheck/api/v1/claimsearch"
        try:
            # `params` URL-encodes the text; raw page text in the URL breaks
            # on characters such as '&' or '#'.
            response = requests.get(
                api_url,
                params={"query": content[:200]},
                timeout=self.REQUEST_TIMEOUT,
            )
            data = response.json()
        except (requests.RequestException, ValueError):
            return 50  # Default uncertainty score
        if not isinstance(data, dict):
            return 50
        return 80 if data.get("claims") else 40

    def check_google_scholar(self, url: str) -> int:
        """
        Count Google Scholar results for ``url`` via SerpAPI.

        Returns:
            10 points per entry in "organic_results", capped at 100; 0 when
            no API key is configured, the request fails, or the response is
            not a JSON object.
        """
        if not self.serpapi_key:
            return 0  # Nothing to query with; skip the network call
        params = {"q": url, "engine": "google_scholar", "api_key": self.serpapi_key}
        try:
            response = requests.get(
                "https://serpapi.com/search",
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            )
            data = response.json()
        except (requests.RequestException, ValueError):
            return 0  # Default to no citations
        if not isinstance(data, dict):
            return 0
        return min(len(data.get("organic_results", [])) * 10, 100)

    def detect_bias(self, content: str) -> int:
        """
        Derive a bias score from the sentiment of the first 512 characters.

        Returns:
            100 for positive sentiment, 50 for neutral (or empty content),
            30 for any other label.
        """
        if not content:
            return 50
        sentiment_result = self.sentiment_analyzer(content[:512])[0]
        label = str(sentiment_result["label"]).upper()
        return self._SENTIMENT_SCORES.get(label, 30)

    def get_star_rating(self, score: float) -> tuple:
        """
        Convert a 0-100 score into ``(stars, icon)`` with 1-5 stars.

        Halves round up (50 -> 3 stars). The built-in ``round`` rounds
        halves to the nearest even integer, which made 50 map to 2 stars
        while 70 mapped to 4.
        """
        stars = max(1, min(5, int(score / 20 + 0.5)))
        return stars, "⭐" * stars

    def generate_explanation(self, domain_trust, similarity_score, fact_check_score, bias_score, citation_score, final_score) -> str:
        """
        Build a human-readable explanation from the weak sub-scores.

        One sentence is added for every sub-score below its threshold (50,
        or 30 for citations). ``final_score`` is accepted for interface
        compatibility but is not used.
        """
        checks = (
            (domain_trust < 50, "The source has low domain authority."),
            (similarity_score < 50, "The content is not highly relevant to your query."),
            (fact_check_score < 50, "Limited fact-checking verification found."),
            (bias_score < 50, "Potential bias detected in the content."),
            (citation_score < 30, "Few citations found for this content."),
        )
        reasons = [message for failed, message in checks if failed]
        return " ".join(reasons) if reasons else "This source is highly credible and relevant."

    def rate_url_validity(self, user_query: str, url: str) -> dict:
        """
        Evaluate ``url`` against ``user_query``.

        Returns:
            A dict with "raw_score" (the five sub-scores plus the weighted
            "Final Validity Score"), "stars" (numeric score and icon) and
            "explanation".
        """
        content = self.fetch_page_content(url)

        domain_trust = self.get_domain_trust(url, content)
        similarity_score = self.compute_similarity_score(user_query, content)
        fact_check_score = self.check_facts(content)
        bias_score = self.detect_bias(content)
        citation_score = self.check_google_scholar(url)

        # Weights sum to 1.0, so the result stays on the 0-100 scale.
        final_score = (
            (0.3 * domain_trust) +
            (0.3 * similarity_score) +
            (0.2 * fact_check_score) +
            (0.1 * bias_score) +
            (0.1 * citation_score)
        )

        stars, icon = self.get_star_rating(final_score)
        explanation = self.generate_explanation(domain_trust, similarity_score, fact_check_score, bias_score, citation_score, final_score)

        return {
            "raw_score": {
                "Domain Trust": domain_trust,
                "Content Relevance": similarity_score,
                "Fact-Check Score": fact_check_score,
                "Bias Score": bias_score,
                "Citation Score": citation_score,
                "Final Validity Score": final_score
            },
            "stars": {
                "score": stars,
                "icon": icon
            },
            "explanation": explanation
        }
155
+
156
# Demo inputs: (user query, URL to be rated) pairs.
queries_urls = [
    ("How blockchain works", "https://www.ibm.com/topics/what-is-blockchain"),
    ("Climate change effects", "https://www.nationalgeographic.com/environment/article/climate-change-overview"),
    ("COVID-19 vaccine effectiveness", "https://www.cdc.gov/coronavirus/2019-ncov/vaccines/effectiveness.html"),
    ("Latest AI advancements", "https://www.technologyreview.com/topic/artificial-intelligence"),
    ("Stock market trends", "https://www.bloomberg.com/markets"),
    ("Healthy diet tips", "https://www.healthline.com/nutrition/healthy-eating-tips"),
    ("Space exploration missions", "https://www.nasa.gov/missions"),
    ("Electric vehicle benefits", "https://www.tesla.com/benefits"),
    ("History of the internet", "https://www.history.com/topics/inventions/history-of-the-internet"),
    ("Nutritional benefits of a vegan diet", "https://www.hsph.harvard.edu/nutritionsource/healthy-weight/diet-reviews/vegan-diet/"),
    ("Mental health awareness", "https://www.who.int/news-room/fact-sheets/detail/mental-health-strengthening-our-response"),
]

# The ratings below are random placeholders for demonstration only; the
# URLValidator class is not invoked here.
import random

# One record per pair; each record draws two independent ratings in 1-5.
formatted_output = [
    {
        "Query": query,
        "URL": url,
        "Function Rating": random.randint(1, 5),  # Simulated rating
        "Custom Rating": random.randint(1, 5),  # Simulated rating
    }
    for query, url in queries_urls
]

# Notebook-style display of the result (a no-op when run as a script).
formatted_output
186
+