Add output path

Fix browser agent
Merge branch 'master' of gitea.gorillamail.biz:cid/Web_Scrape
2023-12-31 23:50:03 -05:00 · 2023-12-30 23:17:41 -05:00 · 2023-12-30 23:15:47 -05:00 · 2023-12-29 15:02:15 -05:00
4 changed files with 5 additions and 3 deletions
--- a/0
+++ b/0
--- a/init.py
+++ b/init.py
--- a/0
+++ b/0
--- a/main.py
+++ b/main.py
@@ -3,7 +3,7 @@ from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urlunparse
 import pandas as pd

-def scrape_headings(url):
+def scrape_headings(url, output_path):
    try:
        # Check if the URL has a scheme (http/https), and add one if missing
        parsed_url = urlparse(url)
@@ -30,7 +30,8 @@ def scrape_headings(url):

            # Convert to DataFrame and save to CSV
            df = pd.DataFrame(heading_data)
-            df.to_csv('output.csv', index=False)
+            df.to_csv(output_path, index=False)
+            return heading_data

        else:
            print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
@@ -42,7 +43,8 @@ def scrape_headings(url):

 def main():
    url = input("Enter the URL: ")
-    scrape_headings(url)
+    output_path = 'output.csv'
+    scrape_headings(url, output_path)
    # df = pd.DataFrame(scrape_headings(url))
    search_again = input("Do you want to search again? y/n:").lower()
    if search_again == 'y':
Author	SHA1	Message	Date
master	fe38c2226f	Add output path	2023-12-31 23:50:03 -05:00
master	8e34d94acd	Fix browser agent	2023-12-30 23:17:41 -05:00
master	125f968a47	Merge branch 'master' of gitea.gorillamail.biz:cid/Web_Scrape	2023-12-30 23:15:47 -05:00
cid	0c9cea5225	Updated main.py "added user-agent headers"	2023-12-29 15:02:15 -05:00