Spaces:
Sleeping
Sleeping
File size: 2,842 Bytes
4b860ec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import json
import pandas as pd
# Scrape every chapter page under `base_url` with Selenium and collect one
# record per section (chapter name, section title, section text) in `dataset`.
# Initialize WebDriver
driver = webdriver.Chrome()
# Base URL for chapters
base_url = "https://devgan.in/bns/chapter_"
# Number of chapters
num_chapters = 20
# Initialize an empty list to store the data
dataset = []
# try/finally guarantees the browser is shut down even when an unexpected
# error escapes the loop; otherwise the Chrome process would be left running.
try:
    # Loop through each chapter
    for chapter_num in range(1, num_chapters + 1):
        # Pause between chapters to avoid overwhelming the server. Done at the
        # top of the loop so chapters abandoned via `continue` still get a
        # pause before the next request.
        if chapter_num > 1:
            time.sleep(2)
        # Chapter pages are numbered with two digits: chapter_01.php, ...
        chapter_url = f"{base_url}{chapter_num:02d}.php"
        print(f"Scraping: {chapter_url}")
        # Open the chapter page; a failed load skips this chapter instead of
        # aborting the whole run and losing the data gathered so far.
        try:
            driver.get(chapter_url)
        except Exception as e:
            print(f"Error loading chapter page: {e}")
            continue
        time.sleep(3)  # Wait for the page to load
        # Get the chapter name
        try:
            chapter_name = driver.find_element(By.TAG_NAME, "h1").text.strip()
        except Exception as e:
            print(f"Error fetching chapter name: {e}")
            continue
        print(f"Chapter: {chapter_name}")
        # Find all sections (subClose and sectxt)
        try:
            section_headers = driver.find_elements(By.CSS_SELECTOR, "h2.subClose")
            section_contents = driver.find_elements(By.CSS_SELECTOR, "div.sectxt")
        except Exception as e:
            print(f"Error finding sections: {e}")
            continue
        # Headers and contents are paired by position, so unequal counts would
        # attach text to the wrong title; skip the chapter in that case.
        if len(section_headers) != len(section_contents):
            print(f"Mismatch in sections and content: {len(section_headers)} headers, {len(section_contents)} contents.")
            continue
        for header, content in zip(section_headers, section_contents):
            # A failure in one section is logged and does not stop the others.
            try:
                section_title = header.text.strip()
                # Expand hidden content if necessary: WebElement.text only
                # returns rendered text, so a display:none element would
                # otherwise yield an empty string.
                if content.value_of_css_property("display") == "none":
                    driver.execute_script("arguments[0].style.display = 'block';", content)
                section_content = content.text.strip()
                # Add data to the dataset
                dataset.append({
                    "chapter": chapter_name,
                    "section_title": section_title,
                    "section_content": section_content,
                })
                print(f"Processed section: {section_title}")
            except Exception as e:
                print(f"Error processing section: {e}")
finally:
    # Close the WebDriver
    driver.quit()
# Persist the collected records twice: once as JSON, once as CSV.

# JSON export: UTF-8 file, non-ASCII characters written as-is, 4-space indent.
output_file_json = "bns_dataset.json"
with open(output_file_json, mode="w", encoding="utf-8") as json_file:
    json_file.write(json.dumps(dataset, indent=4, ensure_ascii=False))
print(f"Dataset saved to {output_file_json}")

# CSV export: one row per record, without the DataFrame index column.
output_file_csv = "bns_dataset.csv"
df = pd.DataFrame(data=dataset)
df.to_csv(path_or_buf=output_file_csv, index=False)
print(f"Dataset saved to {output_file_csv}")
|