Fred808 committed
Commit 891707d · verified · 1 Parent(s): 03cd501

Update app.py

Files changed (1)
  1. app.py +125 -75
app.py CHANGED
@@ -2,7 +2,8 @@ import re
  import json
  import numpy as np
  import faiss
- from flask import Flask, request, jsonify
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
  from transformers import (
      pipeline,
      AutoModelForSequenceClassification,
@@ -14,8 +15,18 @@ from transformers import (
  )
  from sentence_transformers import SentenceTransformer
  from bertopic import BERTopic
- from datasets import load_dataset, Features, Value
- import pyarrow.parquet as pq
+ from datasets import Features, Value
+ from googleapiclient.discovery import build
+ from youtube_transcript_api import YouTubeTranscriptApi
+
+ # Initialize FastAPI app
+ app = FastAPI()
+
+ # YouTube Data API setup
+ API_KEY = "your_youtube_api_key"
+ YOUTUBE_API_SERVICE_NAME = "youtube"
+ YOUTUBE_API_VERSION = "v3"
+ youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY)

  # Preprocessing function
  def preprocess_text(text):
@@ -128,9 +139,6 @@ class Chatbot:
          return response


- # Flask API for Chatbot Integration
- app = Flask(__name__)
-
  # Initialize models
  classifier = ContentClassifier()
  relevance_detector = RelevanceDetector()
@@ -139,93 +147,135 @@ search_engine = SearchEngine()
  topic_extractor = TopicExtractor()
  chatbot = Chatbot()

- # Load the yt-commons dataset
-
-
- # Define the schema
- features = Features({
-     "video_id": Value("string"),
-     "video_link": Value("string"),
-     "title": Value("string"),
-     "text": Value("string"),
-     "channel": Value("string"),
-     "channel_id": Value("string"),
-     "date": Value("string"),
-     "license": Value("string"),
-     "original_language": Value("string"),
-     "source_language": Value("string"),
-     "transcription_language": Value("string"),
-     "word_count": Value("int64"),
-     "character_count": Value("int64"),
- })
-
- # Load the dataset from Hugging Face Hub
- try:
-     dataset = load_dataset(
-         "PleIAs/YouTube-Commons",
-         features=features,
-         streaming=True,
+ # Fetch video metadata using YouTube Data API
+ def fetch_video_metadata(video_id):
+     request = youtube.videos().list(
+         part="snippet,statistics",
+         id=video_id
      )
+     response = request.execute()
+     return response["items"][0] if response["items"] else None
+
+
+ # Fetch video transcript using youtube-transcript-api
+ def fetch_video_transcript(video_id):
+     try:
+         transcript = YouTubeTranscriptApi.get_transcript(video_id)
+         return " ".join([entry["text"] for entry in transcript])
+     except Exception as e:
+         print(f"Error fetching transcript: {e}")
+         return None
+
+
+ # Fetch and preprocess video data
+ def fetch_and_preprocess_video_data(video_id):
+     metadata = fetch_video_metadata(video_id)
+     if not metadata:
+         return None
+
+     transcript = fetch_video_transcript(video_id)
+
+     # Preprocess the data
+     video_data = {
+         "video_id": video_id,
+         "video_link": f"https://www.youtube.com/watch?v={video_id}",
+         "title": metadata["snippet"]["title"],
+         "text": transcript if transcript else metadata["snippet"]["description"],
+         "channel": metadata["snippet"]["channelTitle"],
+         "channel_id": metadata["snippet"]["channelId"],
+         "date": metadata["snippet"]["publishedAt"],
+         "license": "Unknown",
+         "original_language": "Unknown",
+         "source_language": "Unknown",
+         "transcription_language": "Unknown",
+         "word_count": len(metadata["snippet"]["description"].split()),
+         "character_count": len(metadata["snippet"]["description"]),
+     }
+     return video_data
+
+
+ # Pydantic models for request validation
+ class VideoRequest(BaseModel):
+     video_id: str
+
+
+ class TextRequest(BaseModel):
+     text: str
+
+
+ class QueryRequest(BaseModel):
+     query: str
+
+
+ class PromptRequest(BaseModel):
+     prompt: str

-     # Process the dataset
-     for example in dataset["train"]:
-         print(example)  # Process each example
-         break  # Stop after the first example for demonstration
- except Exception as e:
-     print(f"Error loading dataset: {e}")

  # API Endpoints
- @app.route("/classify", methods=["POST"])
- def classify():
-     text = request.json.get("text", "")
-     if not text:
-         return jsonify({"error": "No text provided"}), 400
-     result = classifier.classify(text)
-     return jsonify(result)
+ @app.post("/classify")
+ async def classify(request: VideoRequest):
+     video_id = request.video_id
+     video_data = fetch_and_preprocess_video_data(video_id)
+     if not video_data:
+         raise HTTPException(status_code=400, detail="Failed to fetch video data")
+
+     result = classifier.classify(video_data["text"])
+     return {"result": result}


- @app.route("/relevance", methods=["POST"])
- def relevance():
-     text = request.json.get("text", "")
-     if not text:
-         return jsonify({"error": "No text provided"}), 400
-     relevant = relevance_detector.detect_relevance(text)
-     return jsonify({"relevant": relevant})
+ @app.post("/relevance")
+ async def relevance(request: VideoRequest):
+     video_id = request.video_id
+     video_data = fetch_and_preprocess_video_data(video_id)
+     if not video_data:
+         raise HTTPException(status_code=400, detail="Failed to fetch video data")
+
+     relevant = relevance_detector.detect_relevance(video_data["text"])
+     return {"relevant": relevant}


- @app.route("/summarize", methods=["POST"])
- def summarize():
-     text = request.json.get("text", "")
-     if not text:
-         return jsonify({"error": "No text provided"}), 400
-     summary = summarizer.summarize(text)
-     return jsonify({"summary": summary})
+ @app.post("/summarize")
+ async def summarize(request: VideoRequest):
+     video_id = request.video_id
+     video_data = fetch_and_preprocess_video_data(video_id)
+     if not video_data:
+         raise HTTPException(status_code=400, detail="Failed to fetch video data")
+
+     summary = summarizer.summarize(video_data["text"])
+     return {"summary": summary}


- @app.route("/search", methods=["POST"])
- def search():
-     query = request.json.get("query", "")
+ @app.post("/search")
+ async def search(request: QueryRequest):
+     query = request.query
      if not query:
-         return jsonify({"error": "No query provided"}), 400
+         raise HTTPException(status_code=400, detail="No query provided")
+
      results = search_engine.search(query)
-     return jsonify({"results": results})
+     return {"results": results}


- @app.route("/topics", methods=["POST"])
- def topics():
-     result = topic_extractor.extract_topics(youtube_data)
-     return jsonify({"topics": result.to_dict()})
+ @app.post("/topics")
+ async def topics(request: TextRequest):
+     text = request.text
+     if not text:
+         raise HTTPException(status_code=400, detail="No text provided")
+
+     result = topic_extractor.extract_topics([text])
+     return {"topics": result.to_dict()}


- @app.route("/chat", methods=["POST"])
- def chat():
-     prompt = request.json.get("prompt", "")
+ @app.post("/chat")
+ async def chat(request: PromptRequest):
+     prompt = request.prompt
      if not prompt:
-         return jsonify({"error": "No prompt provided"}), 400
+         raise HTTPException(status_code=400, detail="No prompt provided")
+
      response = chatbot.generate_response(prompt)
-     return jsonify({"response": response})
+     return {"response": response}


- # Start the Flask API
+ # Start the FastAPI app
  if __name__ == "__main__":
-     app.run(debug=True)
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
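For reference, the reworked endpoints accept JSON bodies matching the Pydantic models introduced in this commit (VideoRequest, QueryRequest, TextRequest, PromptRequest) instead of the old Flask request.json payloads. A minimal client sketch, not part of the commit, assuming the app is running locally with the host/port from the __main__ block and a valid YouTube API key:

import requests

BASE_URL = "http://localhost:8000"  # matches uvicorn.run(app, host="0.0.0.0", port=8000)

# /classify, /relevance and /summarize expect a VideoRequest body: {"video_id": "..."}
resp = requests.post(f"{BASE_URL}/classify", json={"video_id": "dQw4w9WgXcQ"})  # example video id
print(resp.status_code, resp.json())

# /search expects a QueryRequest body, /chat a PromptRequest body
resp = requests.post(f"{BASE_URL}/chat", json={"prompt": "What is this channel about?"})
print(resp.status_code, resp.json())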
 
 
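The new fetch helpers can also be exercised on their own, which is a quick way to confirm the API key and transcript availability before calling the endpoints. A rough sketch, assuming app.py is importable (importing it will also initialize the models and build the YouTube client) and API_KEY has been replaced with a real key:

from app import fetch_video_transcript, fetch_and_preprocess_video_data

video_id = "dQw4w9WgXcQ"  # example video id

# Transcript alone; returns None when no transcript is available
transcript = fetch_video_transcript(video_id)
print(transcript[:200] if transcript else "no transcript")

# Full preprocessed record consumed by /classify, /relevance and /summarize
record = fetch_and_preprocess_video_data(video_id)
if record:
    print(record["title"], record["channel"], record["word_count"])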