# Page-capture residue (not part of the script): Spaces: Sleeping
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
import time | |
import json | |
import pandas as pd | |
# Scrape every chapter page under https://devgan.in/bns/ with Selenium and
# export the collected sections to a JSON file and a CSV file.


def _scrape_chapter(driver, chapter_url):
    """Return the section records found on a single chapter page.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        chapter_url: Address of the chapter page to scrape.

    Returns:
        A list of dicts with the keys "chapter", "section_title" and
        "section_content". The list is empty when the page cannot be
        loaded, the chapter heading is missing, the section lookup fails,
        or the number of section headers differs from the number of
        section bodies (pairing them by position would then be unreliable).
    """
    # A failed navigation must only cost this chapter, not the whole run.
    try:
        driver.get(chapter_url)
    except Exception as e:
        print(f"Error loading page: {e}")
        return []
    time.sleep(3)  # Wait for the page to load

    # Get the chapter name
    try:
        chapter_name = driver.find_element(By.TAG_NAME, "h1").text.strip()
    except Exception as e:
        print(f"Error fetching chapter name: {e}")
        return []
    print(f"Chapter: {chapter_name}")

    # Find all sections (subClose and sectxt)
    try:
        section_headers = driver.find_elements(By.CSS_SELECTOR, "h2.subClose")
        section_contents = driver.find_elements(By.CSS_SELECTOR, "div.sectxt")
    except Exception as e:
        print(f"Error finding sections: {e}")
        return []

    if len(section_headers) != len(section_contents):
        print(f"Mismatch in sections and content: {len(section_headers)} headers, {len(section_contents)} contents.")
        return []

    records = []
    for header, content in zip(section_headers, section_contents):
        try:
            section_title = header.text.strip()
            # Expand hidden content if necessary: the text of an element
            # styled display:none reads back as empty.
            if content.value_of_css_property("display") == "none":
                driver.execute_script("arguments[0].style.display = 'block';", content)
            section_content = content.text.strip()
        except Exception as e:
            # Best effort: one unreadable section does not discard the rest.
            print(f"Error processing section: {e}")
            continue
        records.append({
            "chapter": chapter_name,
            "section_title": section_title,
            "section_content": section_content,
        })
        print(f"Processed section: {section_title}")
    return records


# Initialize WebDriver
driver = webdriver.Chrome()

# Base URL for chapters
base_url = "https://devgan.in/bns/chapter_"

# Number of chapters
num_chapters = 20

# Initialize an empty list to store the data
dataset = []

# Loop through each chapter; the finally clause guarantees the browser is
# closed even if the run is interrupted or fails unexpectedly.
try:
    for chapter_num in range(1, num_chapters + 1):
        if chapter_num > 1:
            # Pause between chapters to avoid overwhelming the server.
            # Done up front so it also applies after a chapter that failed.
            time.sleep(2)
        chapter_url = f"{base_url}{chapter_num:02d}.php"
        print(f"Scraping: {chapter_url}")
        dataset.extend(_scrape_chapter(driver, chapter_url))
finally:
    # Close the WebDriver
    driver.quit()

# Save the dataset to a JSON file
output_file_json = "bns_dataset.json"
with open(output_file_json, "w", encoding="utf-8") as json_file:
    json.dump(dataset, json_file, ensure_ascii=False, indent=4)
print(f"Dataset saved to {output_file_json}")

# Save the dataset to a CSV file. The columns are pinned so the header row
# is written even when nothing was scraped.
output_file_csv = "bns_dataset.csv"
df = pd.DataFrame(dataset, columns=["chapter", "section_title", "section_content"])
df.to_csv(output_file_csv, index=False)
print(f"Dataset saved to {output_file_csv}")