Spaces:
Runtime error
Runtime error
minor changes for new wandb project
Browse files- .gitignore +2 -1
- data/yt_podcast_transcript.csv +0 -0
- src/config.py +5 -3
- src/summarize.py +13 -14
.gitignore
CHANGED
@@ -161,4 +161,5 @@ cython_debug/
|
|
161 |
notebooks/
|
162 |
downloaded_data/
|
163 |
wandb/
|
164 |
-
.vscode/
|
|
|
|
161 |
notebooks/
|
162 |
downloaded_data/
|
163 |
wandb/
|
164 |
+
.vscode/
|
165 |
+
downloaded_artifacts/
|
data/yt_podcast_transcript.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
src/config.py
CHANGED
@@ -8,11 +8,13 @@ class Config:
|
|
8 |
|
9 |
# paths
|
10 |
root_data_dir: Path = Path("data")
|
|
|
|
|
11 |
# wandb
|
12 |
project_name: str = "gradient_dissent_qabot"
|
13 |
-
yt_podcast_data_artifact: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest"
|
14 |
-
summarized_data_artifact: str = "gladiator/gradient_dissent_bot/summary_data:latest"
|
15 |
-
summarized_que_data_artifact: str = "gladiator/gradient_dissent_bot/summary_que_data:latest"
|
16 |
|
17 |
|
18 |
config = Config()
|
|
|
8 |
|
9 |
# paths
|
10 |
root_data_dir: Path = Path("data")
|
11 |
+
root_artifact_dir: Path = Path("downloaded_artifacts")
|
12 |
+
|
13 |
# wandb
|
14 |
project_name: str = "gradient_dissent_qabot"
|
15 |
+
yt_podcast_data_artifact: str = "gladiator/gradient_dissent_qabot/yt_podcast_transcript:latest"
|
16 |
+
# summarized_data_artifact: str = "gladiator/gradient_dissent_bot/summary_data:latest"
|
17 |
+
# summarized_que_data_artifact: str = "gladiator/gradient_dissent_bot/summary_que_data:latest"
|
18 |
|
19 |
|
20 |
config = Config()
|
src/summarize.py
CHANGED
@@ -2,7 +2,6 @@ import os
|
|
2 |
from dataclasses import asdict
|
3 |
|
4 |
import pandas as pd
|
5 |
-
import wandb
|
6 |
from langchain.callbacks import get_openai_callback
|
7 |
from langchain.chains.summarize import load_summarize_chain
|
8 |
from langchain.chat_models import ChatOpenAI
|
@@ -12,16 +11,15 @@ from langchain.text_splitter import TokenTextSplitter
|
|
12 |
from tqdm import tqdm
|
13 |
from wandb.integration.langchain import WandbTracer
|
14 |
|
|
|
15 |
from config import config
|
16 |
|
17 |
|
18 |
-
def get_data(
|
19 |
-
artifact_name: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest",
|
20 |
-
total_episodes: int = None,
|
21 |
-
):
|
22 |
podcast_artifact = wandb.use_artifact(artifact_name, type="dataset")
|
23 |
-
podcast_artifact_dir = podcast_artifact.download(config.root_data_dir)
|
24 |
-
|
|
|
25 |
if total_episodes is not None:
|
26 |
df = df.iloc[:total_episodes]
|
27 |
return df
|
@@ -77,15 +75,14 @@ if __name__ == "__main__":
|
|
77 |
# initialize wandb tracer
|
78 |
WandbTracer.init(
|
79 |
{
|
80 |
-
"project":
|
81 |
-
"name": "summarize_3",
|
82 |
"job_type": "summarize",
|
83 |
"config": asdict(config),
|
84 |
}
|
85 |
)
|
86 |
|
87 |
# get scraped data
|
88 |
-
df = get_data(artifact_name=config.yt_podcast_data_artifact, total_episodes=
|
89 |
|
90 |
summaries = []
|
91 |
with get_openai_callback() as cb:
|
@@ -110,15 +107,17 @@ if __name__ == "__main__":
|
|
110 |
|
111 |
df["summary"] = summaries
|
112 |
|
113 |
-
#
|
114 |
-
path_to_save = os.path.join(config.root_data_dir, "
|
115 |
df.to_csv(path_to_save)
|
116 |
-
|
|
|
|
|
117 |
artifact.add_file(path_to_save)
|
118 |
wandb.log_artifact(artifact)
|
119 |
|
120 |
# create wandb table
|
121 |
table = wandb.Table(dataframe=df)
|
122 |
-
wandb.log({"
|
123 |
|
124 |
WandbTracer.finish()
|
|
|
2 |
from dataclasses import asdict
|
3 |
|
4 |
import pandas as pd
|
|
|
5 |
from langchain.callbacks import get_openai_callback
|
6 |
from langchain.chains.summarize import load_summarize_chain
|
7 |
from langchain.chat_models import ChatOpenAI
|
|
|
11 |
from tqdm import tqdm
|
12 |
from wandb.integration.langchain import WandbTracer
|
13 |
|
14 |
+
import wandb
|
15 |
from config import config
|
16 |
|
17 |
|
18 |
+
def get_data(artifact_name: str, total_episodes: int = None):
|
|
|
|
|
|
|
19 |
podcast_artifact = wandb.use_artifact(artifact_name, type="dataset")
|
20 |
+
podcast_artifact_dir = podcast_artifact.download(config.root_artifact_dir)
|
21 |
+
filename = artifact_name.split(":")[0].split("/")[-1]
|
22 |
+
df = pd.read_csv(os.path.join(podcast_artifact_dir, f"{filename}.csv"))
|
23 |
if total_episodes is not None:
|
24 |
df = df.iloc[:total_episodes]
|
25 |
return df
|
|
|
75 |
# initialize wandb tracer
|
76 |
WandbTracer.init(
|
77 |
{
|
78 |
+
"project": config.project_name,
|
|
|
79 |
"job_type": "summarize",
|
80 |
"config": asdict(config),
|
81 |
}
|
82 |
)
|
83 |
|
84 |
# get scraped data
|
85 |
+
df = get_data(artifact_name=config.yt_podcast_data_artifact, total_episodes=2)
|
86 |
|
87 |
summaries = []
|
88 |
with get_openai_callback() as cb:
|
|
|
107 |
|
108 |
df["summary"] = summaries
|
109 |
|
110 |
+
# save data
|
111 |
+
path_to_save = os.path.join(config.root_data_dir, "summarized_podcasts.csv")
|
112 |
df.to_csv(path_to_save)
|
113 |
+
|
114 |
+
# log to wandb artifact
|
115 |
+
artifact = wandb.Artifact("summarized_podcasts", type="dataset")
|
116 |
artifact.add_file(path_to_save)
|
117 |
wandb.log_artifact(artifact)
|
118 |
|
119 |
# create wandb table
|
120 |
table = wandb.Table(dataframe=df)
|
121 |
+
wandb.log({"summarized_podcasts": table})
|
122 |
|
123 |
WandbTracer.finish()
|