Compare commits

...

9 Commits

Author  SHA1        Message                                        Date
        89b432ea2a  Add Browser agent                              2023-12-30 22:07:12 -05:00
cid     0c3cfdf0e9  Merge remote-tracking branch 'origin/master'   2023-12-23 23:58:20 +08:00
cid     8b837b2d7c  included csv function                          2023-12-23 23:55:11 +08:00
cid     d0c541a318  added csv                                      2023-12-23 23:40:41 +08:00
cid     bc1619f11e  added a csv output function                    2023-12-23 23:34:57 +08:00
cid     e2fe8ab8e8  added csv function                             2023-12-23 23:14:12 +08:00
cid     0627bd446b  Merge remote-tracking branch 'origin/main'     2023-12-23 09:35:55 +08:00
cid     96d8926ac8  added chromedriver and main.py                 2023-12-23 09:00:11 +08:00
        3595537ed1  Initial commit                                 2023-12-22 16:53:14 -08:00
3 changed files with 57 additions and 0 deletions

README.md (Normal file, 1 line added)

@@ -0,0 +1 @@
# WebScrape

chromedriver (Executable file, binary)

Binary file not shown.

main.py (Normal file, 56 lines added)

@@ -0,0 +1,56 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse
import pandas as pd


def scrape_headings(url):
    try:
        # Check if the URL has a scheme (http/https), and add one if missing
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = urlunparse(('http',) + parsed_url[1:])
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        # Send an HTTP GET request to the specified URL
        response = requests.get(url, headers=headers)
        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find all the heading elements (h1, h2, h3, etc.)
            headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
            # Pair each heading's tag name with its stripped text
            heading_data = [{"Heading Type": heading.name, "Text": heading.text.strip()} for heading in headings]
            # Convert to DataFrame and save to CSV
            df = pd.DataFrame(heading_data)
            df.to_csv('output.csv', index=False)
        else:
            print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")


def main():
    url = input("Enter the URL: ")
    scrape_headings(url)
    # df = pd.DataFrame(scrape_headings(url))
    search_again = input("Do you want to search again? y/n: ").lower()
    if search_again == 'y':
        # df.to_csv('output.csv')
        main()
    else:
        # df.to_csv('output.csv')
        exit()


if __name__ == "__main__":
    main()
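
For reference, a run of main.py prompts for a URL and writes the scraped headings to output.csv with the two columns defined above ("Heading Type" and "Text"). Against a hypothetical page containing one h1 and two h2 headings, the file would look roughly like this (the row values are illustrative only, not output from a real page):

Heading Type,Text
h1,Example Domain
h2,More information
h2,Related links

Note that df.to_csv('output.csv', index=False) overwrites the file on every call, so when the user answers 'y' and scrapes a second URL, only the most recent page's headings are kept.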