import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import pandas as pd

# Browser-like User-Agent, defined once at module level and reused below
# (the original defined a second, unused headers dict inside the function).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def scrape_headings(url):
    try:
        # Bare domains like "example.com" parse with an empty scheme (the host
        # lands in .path), so prepend a scheme directly instead of rebuilding
        # the tuple with urlunparse, which would mangle scheme-less input.
        if not urlparse(url).scheme:
            url = 'http://' + url

        # Send an HTTP GET request; time out rather than hang on a dead server
        response = requests.get(url, headers=HEADERS, timeout=10)

        # Proceed only if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all the heading elements (h1 through h6)
            headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

            # Pair each heading's tag name with its stripped text
            heading_data = [
                {"Heading Type": heading.name, "Text": heading.text.strip()}
                for heading in headings
            ]

            # Convert to a DataFrame and save to CSV
            # (note: each call overwrites output.csv)
            df = pd.DataFrame(heading_data)
            df.to_csv('output.csv', index=False)
        else:
            print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
    except Exception as e:
        print(f"An error occurred: {e}")

def main():
    # Loop instead of recursing: repeated "search again" answers would
    # otherwise grow the call stack with every round.
    while True:
        url = input("Enter the URL: ")
        scrape_headings(url)
        search_again = input("Do you want to search again? y/n: ").lower()
        if search_again != 'y':
            break

if __name__ == "__main__":
    main()