Compare commits
9 Commits
c246a09b89
...
thierry-de
| Author | SHA1 | Date | |
|---|---|---|---|
| 89b432ea2a | |||
| 0c3cfdf0e9 | |||
| 8b837b2d7c | |||
| d0c541a318 | |||
| bc1619f11e | |||
| e2fe8ab8e8 | |||
| 0627bd446b | |||
| 96d8926ac8 | |||
| 3595537ed1 |
BIN
chromedriver
Executable file
BIN
chromedriver
Executable file
Binary file not shown.
56
main.py
Normal file
56
main.py
Normal file
@ -0,0 +1,56 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
import pandas as pd
|
||||
|
||||
def scrape_headings(url):
    """Scrape all heading tags (h1-h6) from *url* and save them to output.csv.

    Parameters
    ----------
    url : str
        Page to scrape. A missing scheme is defaulted to ``http``.

    Returns
    -------
    list[dict] | None
        The scraped headings as ``[{"Heading Type": ..., "Text": ...}, ...]``
        on success, or ``None`` when the request failed or an error occurred.
    """
    try:
        # Check if the URL has a scheme (http/https), and add one if missing
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = urlunparse(('http',) + parsed_url[1:])

        # Browser-like User-Agent avoids trivial bot blocking by some sites.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }

        # Send an HTTP GET request; timeout prevents hanging forever on a
        # dead or unresponsive host (the original call had no timeout).
        response = requests.get(url, headers=headers, timeout=10)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all the heading elements (h1, h2, h3, etc.)
            headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

            # Extract the result of headings with their types
            heading_data = [
                {"Heading Type": heading.name, "Text": heading.text.strip()}
                for heading in headings
            ]

            # Convert to DataFrame and save to CSV
            df = pd.DataFrame(heading_data)
            df.to_csv('output.csv', index=False)

            # Return the data so callers (see the commented-out DataFrame
            # code in main) can reuse it instead of re-reading the CSV.
            return heading_data
        else:
            print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
    except Exception as e:
        # Broad catch keeps the interactive session alive on any
        # network/parse error; the error is reported, not swallowed.
        print(f"An error occurred: {str(e)}")
    return None
||||
def main():
    """Interactively prompt for URLs and scrape each until the user quits.

    Uses an iterative loop instead of the original recursive self-call,
    which grew the call stack by one frame per scraped URL. Any answer
    other than 'y' (case-insensitive) ends the session normally, replacing
    the original `exit()` call (a `site` convenience not meant for scripts).
    """
    while True:
        url = input("Enter the URL: ")
        scrape_headings(url)
        search_again = input("Do you want to search again? y/n:").lower()
        if search_again != 'y':
            break
||||
# Run the interactive scraper only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Reference in New Issue
Block a user