Updated main.py "added user-agent headers"

2023-12-29 15:02:15 -05:00
1 changed files with 21 additions and 23 deletions
@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urlunparse
 import pandas as pd

+headers = {'User-Agent': 'myprogram/1.0'}
 def scrape_headings(url):
    try:
        # Check if the URL has a scheme (http/https), and add one if missing
@@ -10,10 +11,6 @@ def scrape_headings(url):
        if not parsed_url.scheme:
            url = urlunparse(('http',) + parsed_url[1:])

-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
-        }
-
        # Send an HTTP GET request to the specified URL
        response = requests.get(url, headers=headers)

@@ -25,12 +22,10 @@ def scrape_headings(url):
            # Find all the heading elements (h1, h2, h3, etc.)
            headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

-            # Extract the result of headings with their types
-            heading_data = [{"Heading Type": heading.name, "Text": heading.text.strip()} for heading in headings]
-
-            # Convert to DataFrame and save to CSV
-            df = pd.DataFrame(heading_data)
-            df.to_csv('output.csv', index=False)
+            # Extract the heading text and write it to a CSV file
+            heading_list = [heading.text for heading in headings]
+            df = pd.DataFrame(heading_list)
+            df.to_csv('output.csv')

        else:
            print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
@@ -39,18 +34,21 @@ def scrape_headings(url):
        print(f"An error occurred: {str(e)}")


+# Prompt for a URL, scrape it, then ask whether to search again
+def main_start():

-def main():
+    if __name__ == "__main__":
        url = input("Enter the URL: ")
        scrape_headings(url)
        # df = pd.DataFrame(scrape_headings(url))
        search_again = input("Do you want to search again? y/n:").lower()
        if search_again == 'y':
            # df.to_csv('output.csv')
-        main()
+            main_start()
        else:
            # df.to_csv('output.csv')
            exit()

-if __name__ == "__main__":
-    main()
+
+main_start()
+