Web_Scrape/main.py

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import pandas as pd


def scrape_headings(url, output_path):
    """Scrape heading elements (h1-h6) from a URL and save them to a CSV file."""
    try:
        # Check if the URL has a scheme (http/https), and add one if missing.
        # urlparse treats a bare "example.com" as a path rather than a host,
        # so the scheme is prefixed directly instead of rebuilt with urlunparse
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = 'http://' + url
        # Use a browser-like User-Agent so simple bot filters don't reject the request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        # Send an HTTP GET request to the specified URL, with a timeout so it cannot hang
        response = requests.get(url, headers=headers, timeout=10)
        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find all the heading elements (h1 through h6)
            headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
            # Pair each heading's tag name with its stripped text
            heading_data = [{"Heading Type": heading.name, "Text": heading.text.strip()}
                            for heading in headings]
            # Convert to a DataFrame and save to CSV
            df = pd.DataFrame(heading_data)
            df.to_csv(output_path, index=False)
            return heading_data
        else:
            print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
    except Exception as e:
        print(f"An error occurred: {e}")
def main():
    output_path = 'output.csv'
    # Loop instead of recursing into main() so repeated searches
    # cannot exhaust the call stack
    while True:
        url = input("Enter the URL: ")
        scrape_headings(url, output_path)
        if input("Do you want to search again? y/n: ").lower() != 'y':
            break
if __name__ == "__main__":
    main()
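# A minimal sketch of calling scrape_headings programmatically instead of via
# the interactive prompt; the URL and output filename are hypothetical, not
# part of the original script:
#
#     rows = scrape_headings("https://example.com", "example_headings.csv")
#     if rows:
#         for row in rows:
#             print(f"{row['Heading Type']}: {row['Text']}")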