Merge remote-tracking branch 'origin/master'

included csv function
added csv
2023-12-23 23:58:20 +08:00 · 2023-12-23 23:55:11 +08:00 · 2023-12-23 23:40:41 +08:00 · 2023-12-23 23:34:57 +08:00 · 2023-12-23 23:14:12 +08:00 · 2023-12-23 09:35:55 +08:00
3 changed files with 54 additions and 0 deletions
@@ -0,0 +1 @@
 # WebScrape
@@ -0,0 +1,53 @@
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urlunparse
 import pandas as pd
 def _with_default_scheme(url):
     """Return *url* with an ``http://`` prefix when it has no usable scheme."""
     url = url.strip()
     parsed_url = urlparse(url)
     # A bare host such as "example.com" parses with an empty scheme and the
     # host in the path, and "localhost:8080" can parse with "localhost" as
     # the scheme; only treat the URL as complete when both parts are present.
     if parsed_url.scheme and parsed_url.netloc:
         return url
     # Strip leading slashes so scheme-relative input ("//example.com")
     # does not end up as "http:////example.com".
     return 'http://' + url.lstrip('/')


 def scrape_headings(url, timeout=10):
     """Fetch *url* and write the text of its h1-h6 headings to ``output.csv``.

     Args:
         url: Address of the page to scrape. ``http://`` is prepended when
             the scheme is missing.
         timeout: Seconds to wait for the server before giving up.

     Returns:
         None. The headings are written to ``output.csv`` in the current
         working directory, replacing any existing file. Request failures,
         non-200 responses and file-write failures are reported on stdout
         instead of being raised.
     """
     url = _with_default_scheme(url)

     # Without a timeout a stalled server would block the prompt forever.
     try:
         response = requests.get(url, timeout=timeout)
     except requests.RequestException as e:
         print(f"An error occurred: {str(e)}")
         return

     if response.status_code != 200:
         print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
         return

     # Collect the text of every heading element on the page.
     soup = BeautifulSoup(response.text, 'html.parser')
     headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
     heading_list = [heading.text for heading in headings]

     df = pd.DataFrame(heading_list)
     try:
         df.to_csv('output.csv')
     except OSError as e:
         print(f"An error occurred: {str(e)}")
 def main_start():
     """Prompt for URLs and scrape each one until the user declines to continue.

     After every scrape the user is asked whether to search again; any
     answer other than ``y`` (case-insensitive) ends the session.
     """
     # Loop rather than recurse so a long session cannot grow the call stack.
     while True:
         url = input("Enter the URL: ")
         scrape_headings(url)
         search_again = input("Do you want to search again? y/n:").strip().lower()
         if search_again != 'y':
             return


 # Only start the interactive prompt when run as a script, not on import.
 if __name__ == "__main__":
     main_start()
Author	SHA1	Message	Date
cid	0c3cfdf0e9	Merge remote-tracking branch 'origin/master'	2023-12-23 23:58:20 +08:00
cid	8b837b2d7c	included csv function	2023-12-23 23:55:11 +08:00
cid	d0c541a318	added csv	2023-12-23 23:40:41 +08:00
cid	bc1619f11e	added a csv output function	2023-12-23 23:34:57 +08:00
cid	e2fe8ab8e8	added csv function	2023-12-23 23:14:12 +08:00
cid	0627bd446b	Merge remote-tracking branch 'origin/main'	2023-12-23 09:35:55 +08:00
cid	96d8926ac8	added chromedriver and main.py	2023-12-23 09:00:11 +08:00
Cid Claudio Biñas	3595537ed1	Initial commit	2023-12-22 16:53:14 -08:00