# BNS-Law4her / bns_scraper.py
# File-viewer page header captured along with the script (not part of the code):
#   chaithanyashaji's picture
#   Upload 9 files
#   4b860ec verified
#   raw
#   history blame
#   2.84 kB
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import json
import pandas as pd
# Scrape every chapter page and collect one record per section into `dataset`.
#
# Each record is a dict with the keys "chapter", "section_title" and
# "section_content". Chapters whose heading cannot be read, or whose header and
# content counts disagree, are reported on stdout and skipped.

# Base URL for chapters; the zero-padded chapter number and ".php" are appended
base_url = "https://devgan.in/bns/chapter_"
# Number of chapters
num_chapters = 20
# Initialize an empty list to store the data
dataset = []

# Initialize WebDriver
driver = webdriver.Chrome()
try:
    # Loop through each chapter
    for chapter_num in range(1, num_chapters + 1):
        chapter_url = f"{base_url}{chapter_num:02d}.php"
        print(f"Scraping: {chapter_url}")

        # Open the chapter page
        driver.get(chapter_url)
        time.sleep(3)  # Wait for the page to load

        # Get the chapter name
        try:
            chapter_name = driver.find_element(By.TAG_NAME, "h1").text.strip()
            print(f"Chapter: {chapter_name}")
        except Exception as e:
            print(f"Error fetching chapter name: {e}")
            continue

        # Find all sections (subClose and sectxt)
        try:
            section_headers = driver.find_elements(By.CSS_SELECTOR, "h2.subClose")
            section_contents = driver.find_elements(By.CSS_SELECTOR, "div.sectxt")

            # Headers and contents are paired purely by position, so a count
            # mismatch would attach text to the wrong title; skip the chapter.
            if len(section_headers) != len(section_contents):
                print(f"Mismatch in sections and content: {len(section_headers)} headers, {len(section_contents)} contents.")
                continue

            for header, content in zip(section_headers, section_contents):
                # A failure on one section is reported and must not abort the
                # remaining sections of the chapter.
                try:
                    section_title = header.text.strip()

                    # Expand hidden content if necessary, so that its text can
                    # be read through the element's `.text` property
                    if content.value_of_css_property("display") == "none":
                        driver.execute_script("arguments[0].style.display = 'block';", content)

                    section_content = content.text.strip()

                    # Add data to the dataset
                    dataset.append({
                        "chapter": chapter_name,
                        "section_title": section_title,
                        "section_content": section_content,
                    })
                    print(f"Processed section: {section_title}")
                except Exception as e:
                    print(f"Error processing section: {e}")
        except Exception as e:
            print(f"Error finding sections: {e}")
            continue

        # Pause between chapters to avoid overwhelming the server
        time.sleep(2)
finally:
    # Close the WebDriver even when scraping is interrupted or raises, so the
    # browser and driver processes are not left running.
    driver.quit()
# Persist the scraped records (`dataset`, a list of dicts) in two formats.

# Save the dataset to a JSON file
output_file_json = "bns_dataset.json"
with open(output_file_json, "w", encoding="utf-8") as json_file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes
    json.dump(dataset, json_file, ensure_ascii=False, indent=4)
print(f"Dataset saved to {output_file_json}")

# Save the dataset to a CSV file
output_file_csv = "bns_dataset.csv"
# Pin the column names so the CSV still gets a header row when nothing was
# scraped; for non-empty data this matches the key order of each record.
df = pd.DataFrame(dataset, columns=["chapter", "section_title", "section_content"])
df.to_csv(output_file_csv, index=False)
print(f"Dataset saved to {output_file_csv}")