Spaces:

chaithanyashaji
/

BNS-Law4her

Sleeping

App Files Files Community

BNS-Law4her / bns_scraper.py

chaithanyashaji

Upload 9 files

4b860ec verified about 1 month ago

raw

history blame

2.84 kB

	from selenium import webdriver
	from selenium.webdriver.common.by import By
	import time
	import json
	import pandas as pd

	# Scrape the BNS chapter pages on devgan.in with Selenium and export one
	# record per section to both a JSON file and a CSV file.


	def _scrape_chapter(driver, chapter_url):
	    """Return one record per section found on a single chapter page.

	    Args:
	        driver: Selenium WebDriver used to load and query the page.
	        chapter_url: Absolute URL of the chapter page to scrape.

	    Returns:
	        A list of dicts with the keys "chapter", "section_title" and
	        "section_content". The list is empty when the chapter heading
	        cannot be read, the section elements cannot be located, or the
	        number of section headers and section bodies differ.

	    Raises:
	        Exception: whatever ``driver.get`` raises when the page cannot be
	            loaded; the caller decides how to handle a failed chapter.
	    """
	    # Open the chapter page
	    driver.get(chapter_url)
	    time.sleep(3)  # Wait for the page to load

	    # Get the chapter name
	    try:
	        chapter_name = driver.find_element(By.TAG_NAME, "h1").text.strip()
	    except Exception as e:
	        print(f"Error fetching chapter name: {e}")
	        return []
	    print(f"Chapter: {chapter_name}")

	    # Find all sections (subClose and sectxt)
	    try:
	        section_headers = driver.find_elements(By.CSS_SELECTOR, "h2.subClose")
	        section_contents = driver.find_elements(By.CSS_SELECTOR, "div.sectxt")
	    except Exception as e:
	        print(f"Error finding sections: {e}")
	        return []

	    # Headers and bodies are paired purely by position, so a count mismatch
	    # would silently attach text to the wrong title; skip the chapter instead.
	    if len(section_headers) != len(section_contents):
	        print(f"Mismatch in sections and content: {len(section_headers)} headers, {len(section_contents)} contents.")
	        return []

	    records = []
	    for header, content in zip(section_headers, section_contents):
	        try:
	            section_title = header.text.strip()
	            # Expand hidden content if necessary: a collapsed section is
	            # forced visible before reading its text.
	            # NOTE(review): relies on WebElement.text returning only
	            # rendered text - confirm against the Selenium docs.
	            if content.value_of_css_property("display") == "none":
	                driver.execute_script("arguments[0].style.display = 'block';", content)
	            section_content = content.text.strip()
	        except Exception as e:
	            # A single broken section must not discard the rest of the chapter.
	            print(f"Error processing section: {e}")
	            continue

	        # Add data to the dataset
	        records.append({
	            "chapter": chapter_name,
	            "section_title": section_title,
	            "section_content": section_content
	        })
	        print(f"Processed section: {section_title}")
	    return records


	# Initialize WebDriver
	driver = webdriver.Chrome()

	# Base URL for chapters; the zero-padded chapter number and ".php" are appended
	base_url = "https://devgan.in/bns/chapter_"

	# Number of chapters
	num_chapters = 20

	# Initialize an empty list to store the data
	dataset = []

	try:
	    # Loop through each chapter
	    for chapter_num in range(1, num_chapters + 1):
	        chapter_url = f"{base_url}{str(chapter_num).zfill(2)}.php"
	        print(f"Scraping: {chapter_url}")

	        try:
	            dataset.extend(_scrape_chapter(driver, chapter_url))
	        except Exception as e:
	            # A failed page load skips this chapter only; records already
	            # collected from earlier chapters are kept and still exported.
	            print(f"Error scraping {chapter_url}: {e}")

	        # Pause between chapters to avoid overwhelming the server; this
	        # runs after failed chapters too.
	        time.sleep(2)
	finally:
	    # Close the WebDriver even if scraping is interrupted, so no browser
	    # process is left behind.
	    driver.quit()

	# Save the dataset to a JSON file
	output_file_json = "bns_dataset.json"
	with open(output_file_json, "w", encoding="utf-8") as json_file:
	    json.dump(dataset, json_file, ensure_ascii=False, indent=4)
	print(f"Dataset saved to {output_file_json}")

	# Save the dataset to a CSV file
	output_file_csv = "bns_dataset.csv"
	df = pd.DataFrame(dataset)
	df.to_csv(output_file_csv, index=False)
	print(f"Dataset saved to {output_file_csv}")