Spaces: Runtime error
minor changes
Browse files
- src/config.py +4 -5
- src/podcast_data.py +6 -4
src/config.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from dataclasses import dataclass
|
|
|
2 |
|
3 |
|
4 |
@dataclass
|
@@ -6,11 +7,9 @@ class Config:
|
|
6 |
playlist_url: str = "https://www.youtube.com/playlist?list=PLD80i8An1OEEb1jP0sjEyiLG8ULRXFob_"
|
7 |
|
8 |
# paths
|
9 |
-
root_data_dir:
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
# artifacts
|
14 |
yt_podcast_data_artifact: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest"
|
15 |
summarized_data_artifact: str = "gladiator/gradient_dissent_bot/summary_data:latest"
|
16 |
summarized_que_data_artifact: str = "gladiator/gradient_dissent_bot/summary_que_data:latest"
|
|
|
1 |
from dataclasses import dataclass
|
2 |
+
from pathlib import Path
|
3 |
|
4 |
|
5 |
@dataclass
|
|
|
7 |
playlist_url: str = "https://www.youtube.com/playlist?list=PLD80i8An1OEEb1jP0sjEyiLG8ULRXFob_"
|
8 |
|
9 |
# paths
|
10 |
+
root_data_dir: Path = Path("data")
|
11 |
+
# wandb
|
12 |
+
project_name: str = "gradient_dissent_qabot"
|
|
|
|
|
13 |
yt_podcast_data_artifact: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest"
|
14 |
summarized_data_artifact: str = "gladiator/gradient_dissent_bot/summary_data:latest"
|
15 |
summarized_que_data_artifact: str = "gladiator/gradient_dissent_bot/summary_que_data:latest"
|
src/podcast_data.py
CHANGED
@@ -2,11 +2,11 @@ import time
|
|
2 |
from dataclasses import asdict
|
3 |
|
4 |
import pandas as pd
|
|
|
5 |
from langchain.document_loaders import YoutubeLoader
|
6 |
from pytube import Playlist, YouTube
|
7 |
from tqdm import tqdm
|
8 |
|
9 |
-
import wandb
|
10 |
from config import config
|
11 |
|
12 |
|
@@ -35,7 +35,7 @@ def retry_access_yt_object(url, max_retries=5, interval_secs=5):
|
|
35 |
|
36 |
|
37 |
if __name__ == "__main__":
|
38 |
-
run = wandb.init(project=
|
39 |
|
40 |
playlist = Playlist(config.playlist_url)
|
41 |
playlist_video_urls = playlist.video_urls
|
@@ -62,10 +62,12 @@ if __name__ == "__main__":
|
|
62 |
|
63 |
print(f"Total podcast episodes scraped: {len(video_data)}")
|
64 |
|
|
|
65 |
df = pd.DataFrame(video_data)
|
66 |
-
df.to_csv(config.
|
67 |
|
68 |
-
|
|
|
69 |
artifact.add_file(config.yt_scraped_data_path)
|
70 |
run.log_artifact(artifact)
|
71 |
|
|
|
2 |
from dataclasses import asdict
|
3 |
|
4 |
import pandas as pd
|
5 |
+
import wandb
|
6 |
from langchain.document_loaders import YoutubeLoader
|
7 |
from pytube import Playlist, YouTube
|
8 |
from tqdm import tqdm
|
9 |
|
|
|
10 |
from config import config
|
11 |
|
12 |
|
|
|
35 |
|
36 |
|
37 |
if __name__ == "__main__":
|
38 |
+
run = wandb.init(project=config.project_name, job_type="dataset", config=asdict(config))
|
39 |
|
40 |
playlist = Playlist(config.playlist_url)
|
41 |
playlist_video_urls = playlist.video_urls
|
|
|
62 |
|
63 |
print(f"Total podcast episodes scraped: {len(video_data)}")
|
64 |
|
65 |
+
# save the scraped data to a csv file
|
66 |
df = pd.DataFrame(video_data)
|
67 |
+
df.to_csv(config.root_data_dir / "yt_podcast_transcript.csv", index=False)
|
68 |
|
69 |
+
# upload the scraped data to wandb
|
70 |
+
artifact = wandb.Artifact("yt_podcast_transcript", type="dataset")
|
71 |
artifact.add_file(config.yt_scraped_data_path)
|
72 |
run.log_artifact(artifact)
|
73 |
|