shyameati committed on
Commit
00e837f
·
1 Parent(s): bced053

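Rename dataset column "ticker" to "symbol", add /dataset-info and /dataset-summary endpoints, remove the /run-once endpoint, and delete hf_push.sh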

Browse files
Files changed (2) hide show
  1. app.py +54 -11
  2. hf_push.sh +0 -30
app.py CHANGED
@@ -43,18 +43,15 @@ def load_hf_dataset():
43
  """
44
  global dataset_cache
45
 
46
- # Already loaded in memory
47
  if dataset_cache is not None:
48
  return dataset_cache
49
 
50
- # Load from persistent cache
51
  if os.path.exists(CACHE_PATH):
52
  logger.info(f"Loading dataset from cache at {CACHE_PATH}")
53
  dataset_cache = load_from_disk(CACHE_PATH)
54
  logger.info(f"Loaded {len(dataset_cache)} rows from cached dataset")
55
  return dataset_cache
56
 
57
- # Download once, then save
58
  logger.info(f"Downloading HF dataset: {DATASET_NAME}")
59
  ds = load_dataset(DATASET_NAME, split="train")
60
 
@@ -93,13 +90,13 @@ def serve_index():
93
 
94
 
95
  # ---------------------------------------------------------
96
- # List all tickers
97
  # ---------------------------------------------------------
98
  @app.get("/tickers")
99
  def get_tickers():
100
  ds = load_hf_dataset()
101
- tickers = sorted(set([t.upper() for t in ds["ticker"]]))
102
- return {"tickers": tickers}
103
 
104
 
105
  # ---------------------------------------------------------
@@ -112,7 +109,7 @@ def get_transcript(symbol: str):
112
 
113
  logger.info(f"Fetching transcript for: {symbol}")
114
 
115
- rows = [r for r in ds if r["ticker"].upper() == symbol]
116
 
117
  if not rows:
118
  raise HTTPException(status_code=404, detail=f"No transcript found for {symbol}")
@@ -121,7 +118,53 @@ def get_transcript(symbol: str):
121
 
122
  return {"symbol": symbol, "records": safe_rows}
123
 
124
- @app.get("/run-once")
125
- def run_once():
126
- import run_once
127
- return {"status": "executed"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  """
44
  global dataset_cache
45
 
 
46
  if dataset_cache is not None:
47
  return dataset_cache
48
 
 
49
  if os.path.exists(CACHE_PATH):
50
  logger.info(f"Loading dataset from cache at {CACHE_PATH}")
51
  dataset_cache = load_from_disk(CACHE_PATH)
52
  logger.info(f"Loaded {len(dataset_cache)} rows from cached dataset")
53
  return dataset_cache
54
 
 
55
  logger.info(f"Downloading HF dataset: {DATASET_NAME}")
56
  ds = load_dataset(DATASET_NAME, split="train")
57
 
 
90
 
91
 
92
  # ---------------------------------------------------------
93
+ # List all symbols
94
  # ---------------------------------------------------------
95
  @app.get("/tickers")
96
  def get_tickers():
97
  ds = load_hf_dataset()
98
+ symbols = sorted(set([s.upper() for s in ds["symbol"]]))
99
+ return {"tickers": symbols}
100
 
101
 
102
  # ---------------------------------------------------------
 
109
 
110
  logger.info(f"Fetching transcript for: {symbol}")
111
 
112
+ rows = [r for r in ds if r["symbol"].upper() == symbol]
113
 
114
  if not rows:
115
  raise HTTPException(status_code=404, detail=f"No transcript found for {symbol}")
 
118
 
119
  return {"symbol": symbol, "records": safe_rows}
120
 
121
+
122
+ # ---------------------------------------------------------
123
+ # Dataset info (size + columns)
124
+ # ---------------------------------------------------------
125
+ @app.get("/dataset-info")
126
+ def dataset_info():
127
+ ds = load_hf_dataset()
128
+
129
+ info = {
130
+ "num_rows": len(ds),
131
+ "columns": ds.column_names,
132
+ "cache_path": CACHE_PATH,
133
+ }
134
+
135
+ return info
136
+
137
+
138
+ # ---------------------------------------------------------
139
+ # Dataset summary (high-level stats)
140
+ # ---------------------------------------------------------
141
+ @app.get("/dataset-summary")
142
+ def dataset_summary():
143
+ ds = load_hf_dataset()
144
+
145
+ symbols = set([s.upper() for s in ds["symbol"]])
146
+ years = set(ds["year"])
147
+ quarters = set(ds["quarter"])
148
+
149
+ dates = [d for d in ds["date"] if d is not None]
150
+ min_date = min(dates) if dates else None
151
+ max_date = max(dates) if dates else None
152
+
153
+ summary = {
154
+ "total_rows": len(ds),
155
+ "unique_symbols": len(symbols),
156
+ "symbols_sample": sorted(list(symbols))[:20],
157
+ "year_range": {
158
+ "min_year": min(years),
159
+ "max_year": max(years)
160
+ },
161
+ "quarters_present": sorted(list(quarters)),
162
+ "date_range": {
163
+ "min_date": min_date,
164
+ "max_date": max_date
165
+ },
166
+ "company_count": len(set(ds["company_id"])),
167
+ }
168
+
169
+ return summary
170
+
hf_push.sh DELETED
@@ -1,30 +0,0 @@
1
- #!/bin/bash
2
-
3
- # -----------------------------
4
- # CONFIGURATION
5
- # -----------------------------
6
- HF_USERNAME="shyameati"
7
- HF_TOKEN="<REDACTED: access token removed; revoke this token and supply it via an environment variable>"
8
- HF_SPACE="transcripts-api"
9
-
10
- # -----------------------------
11
- # GIT SETUP
12
- # -----------------------------
13
- echo "Configuring Git remote with embedded HF token..."
14
-
15
- git remote set-url origin https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_SPACE
16
-
17
- # -----------------------------
18
- # COMMIT + PUSH
19
- # -----------------------------
20
- echo "Staging changes..."
21
- git add .
22
-
23
- echo "Committing..."
24
- git commit -m "Auto-update from script" || echo "No changes to commit"
25
-
26
- echo "Pushing to Hugging Face..."
27
- git push origin main
28
-
29
- echo "Done."
30
-