Gladiator committed on
Commit
10b23b5
1 Parent(s): bc273b1

minor changes for new wandb project

Browse files
.gitignore CHANGED
@@ -161,4 +161,5 @@ cython_debug/
161
  notebooks/
162
  downloaded_data/
163
  wandb/
164
- .vscode/
 
 
161
  notebooks/
162
  downloaded_data/
163
  wandb/
164
+ .vscode/
165
+ downloaded_artifacts/
data/yt_podcast_transcript.csv CHANGED
The diff for this file is too large to render. See raw diff
 
src/config.py CHANGED
@@ -8,11 +8,13 @@ class Config:
8
 
9
  # paths
10
  root_data_dir: Path = Path("data")
 
 
11
  # wandb
12
  project_name: str = "gradient_dissent_qabot"
13
- yt_podcast_data_artifact: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest"
14
- summarized_data_artifact: str = "gladiator/gradient_dissent_bot/summary_data:latest"
15
- summarized_que_data_artifact: str = "gladiator/gradient_dissent_bot/summary_que_data:latest"
16
 
17
 
18
  config = Config()
 
8
 
9
  # paths
10
  root_data_dir: Path = Path("data")
11
+ root_artifact_dir: Path = Path("downloaded_artifacts")
12
+
13
  # wandb
14
  project_name: str = "gradient_dissent_qabot"
15
+ yt_podcast_data_artifact: str = "gladiator/gradient_dissent_qabot/yt_podcast_transcript:latest"
16
+ # summarized_data_artifact: str = "gladiator/gradient_dissent_bot/summary_data:latest"
17
+ # summarized_que_data_artifact: str = "gladiator/gradient_dissent_bot/summary_que_data:latest"
18
 
19
 
20
  config = Config()
src/summarize.py CHANGED
@@ -2,7 +2,6 @@ import os
2
  from dataclasses import asdict
3
 
4
  import pandas as pd
5
- import wandb
6
  from langchain.callbacks import get_openai_callback
7
  from langchain.chains.summarize import load_summarize_chain
8
  from langchain.chat_models import ChatOpenAI
@@ -12,16 +11,15 @@ from langchain.text_splitter import TokenTextSplitter
12
  from tqdm import tqdm
13
  from wandb.integration.langchain import WandbTracer
14
 
 
15
  from config import config
16
 
17
 
18
- def get_data(
19
- artifact_name: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest",
20
- total_episodes: int = None,
21
- ):
22
  podcast_artifact = wandb.use_artifact(artifact_name, type="dataset")
23
- podcast_artifact_dir = podcast_artifact.download(config.root_data_dir)
24
- df = pd.read_csv(os.path.join(podcast_artifact_dir, "yt_data.csv"))
 
25
  if total_episodes is not None:
26
  df = df.iloc[:total_episodes]
27
  return df
@@ -77,15 +75,14 @@ if __name__ == "__main__":
77
  # initialize wandb tracer
78
  WandbTracer.init(
79
  {
80
- "project": "gradient_dissent_bot",
81
- "name": "summarize_3",
82
  "job_type": "summarize",
83
  "config": asdict(config),
84
  }
85
  )
86
 
87
  # get scraped data
88
- df = get_data(artifact_name=config.yt_podcast_data_artifact, total_episodes=3)
89
 
90
  summaries = []
91
  with get_openai_callback() as cb:
@@ -110,15 +107,17 @@ if __name__ == "__main__":
110
 
111
  df["summary"] = summaries
112
 
113
- # log to wandb artifact
114
- path_to_save = os.path.join(config.root_data_dir, "summary_data.csv")
115
  df.to_csv(path_to_save)
116
- artifact = wandb.Artifact("summary_data", type="dataset")
 
 
117
  artifact.add_file(path_to_save)
118
  wandb.log_artifact(artifact)
119
 
120
  # create wandb table
121
  table = wandb.Table(dataframe=df)
122
- wandb.log({"summary_data": table})
123
 
124
  WandbTracer.finish()
 
2
  from dataclasses import asdict
3
 
4
  import pandas as pd
 
5
  from langchain.callbacks import get_openai_callback
6
  from langchain.chains.summarize import load_summarize_chain
7
  from langchain.chat_models import ChatOpenAI
 
11
  from tqdm import tqdm
12
  from wandb.integration.langchain import WandbTracer
13
 
14
+ import wandb
15
  from config import config
16
 
17
 
18
+ def get_data(artifact_name: str, total_episodes: int = None):
 
 
 
19
  podcast_artifact = wandb.use_artifact(artifact_name, type="dataset")
20
+ podcast_artifact_dir = podcast_artifact.download(config.root_artifact_dir)
21
+ filename = artifact_name.split(":")[0].split("/")[-1]
22
+ df = pd.read_csv(os.path.join(podcast_artifact_dir, f"{filename}.csv"))
23
  if total_episodes is not None:
24
  df = df.iloc[:total_episodes]
25
  return df
 
75
  # initialize wandb tracer
76
  WandbTracer.init(
77
  {
78
+ "project": config.project_name,
 
79
  "job_type": "summarize",
80
  "config": asdict(config),
81
  }
82
  )
83
 
84
  # get scraped data
85
+ df = get_data(artifact_name=config.yt_podcast_data_artifact, total_episodes=2)
86
 
87
  summaries = []
88
  with get_openai_callback() as cb:
 
107
 
108
  df["summary"] = summaries
109
 
110
+ # save data
111
+ path_to_save = os.path.join(config.root_data_dir, "summarized_podcasts.csv")
112
  df.to_csv(path_to_save)
113
+
114
+ # log to wandb artifact
115
+ artifact = wandb.Artifact("summarized_podcasts", type="dataset")
116
  artifact.add_file(path_to_save)
117
  wandb.log_artifact(artifact)
118
 
119
  # create wandb table
120
  table = wandb.Table(dataframe=df)
121
+ wandb.log({"summarized_podcasts": table})
122
 
123
  WandbTracer.finish()