File size: 2,842 Bytes
4b860ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import json
import pandas as pd

# Initialize WebDriver
driver = webdriver.Chrome()

# Base URL for chapters; the zero-padded chapter number and ".php" are appended
base_url = "https://devgan.in/bns/chapter_"

# Number of chapters
num_chapters = 20

# Collected records, one dict per section with the keys
# "chapter", "section_title" and "section_content"
dataset = []


def _scrape_chapter(driver, chapter_url):
    """Load one chapter page and return its sections as a list of records.

    Each record is a dict with the keys "chapter", "section_title" and
    "section_content". Every failure is reported on stdout and results in
    an empty (or partial) list instead of an exception, so a single bad
    chapter never aborts the whole run.
    """
    # Open the chapter page; a load failure (timeout, network error) must not
    # take the whole scrape down with it.
    try:
        driver.get(chapter_url)
    except Exception as e:
        print(f"Error loading chapter page: {e}")
        return []
    time.sleep(3)  # Wait for the page to load

    # Get the chapter name from the first <h1> on the page
    try:
        chapter_name = driver.find_element(By.TAG_NAME, "h1").text.strip()
        print(f"Chapter: {chapter_name}")
    except Exception as e:
        print(f"Error fetching chapter name: {e}")
        return []

    # Find all sections: "h2.subClose" headers and "div.sectxt" bodies are
    # paired by position, so the two lists must have the same length.
    try:
        section_headers = driver.find_elements(By.CSS_SELECTOR, "h2.subClose")
        section_contents = driver.find_elements(By.CSS_SELECTOR, "div.sectxt")

        if len(section_headers) != len(section_contents):
            print(f"Mismatch in sections and content: {len(section_headers)} headers, {len(section_contents)} contents.")
            return []
    except Exception as e:
        print(f"Error finding sections: {e}")
        return []

    records = []
    for header, content in zip(section_headers, section_contents):
        try:
            section_title = header.text.strip()
            # Expand hidden content if necessary so that its text can be read
            if content.value_of_css_property("display") == "none":
                driver.execute_script("arguments[0].style.display = 'block';", content)
            section_content = content.text.strip()

            records.append({
                "chapter": chapter_name,
                "section_title": section_title,
                "section_content": section_content,
            })
            print(f"Processed section: {section_title}")
        except Exception as e:
            # Skip only the broken section; the rest of the chapter is kept
            print(f"Error processing section: {e}")
    return records


# Loop through each chapter. The try/finally guarantees the browser is shut
# down even if the run is interrupted or an unexpected error escapes.
try:
    for chapter_num in range(1, num_chapters + 1):
        chapter_url = f"{base_url}{str(chapter_num).zfill(2)}.php"
        print(f"Scraping: {chapter_url}")

        dataset.extend(_scrape_chapter(driver, chapter_url))

        # Pause between chapters to avoid overwhelming the server; this runs
        # after failed chapters too, so errors never speed up the request rate.
        time.sleep(2)
finally:
    # Close the WebDriver
    driver.quit()

# Save the dataset to a JSON file (UTF-8, non-ASCII characters kept as-is)
output_file_json = "bns_dataset.json"
with open(output_file_json, "w", encoding="utf-8") as json_file:
    json.dump(dataset, json_file, ensure_ascii=False, indent=4)
print(f"Dataset saved to {output_file_json}")

# Save the dataset to a CSV file. The columns are named explicitly so that
# the header row is still written when nothing was scraped; for a non-empty
# dataset this is the same column order the records already have.
output_file_csv = "bns_dataset.csv"
df = pd.DataFrame(dataset, columns=["chapter", "section_title", "section_content"])
df.to_csv(output_file_csv, index=False, encoding="utf-8")
print(f"Dataset saved to {output_file_csv}")