diff --git a/FETCH_HEAD b/FETCH_HEAD new file mode 100644 index 0000000..e69de29 diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/git b/git new file mode 100644 index 0000000..e69de29 diff --git a/main.py b/main.py index c8dd9fa..bf75576 100644 --- a/main.py +++ b/main.py @@ -3,7 +3,7 @@ from bs4 import BeautifulSoup from urllib.parse import urlparse, urlunparse import pandas as pd -def scrape_headings(url): +def scrape_headings(url, output_path): try: # Check if the URL has a scheme (http/https), and add one if missing parsed_url = urlparse(url) @@ -30,7 +30,8 @@ def scrape_headings(url): # Convert to DataFrame and save to CSV df = pd.DataFrame(heading_data) - df.to_csv('output.csv', index=False) + df.to_csv(output_path, index=False) + return heading_data else: print(f"Failed to retrieve content from {url}. Status code: {response.status_code}") @@ -41,8 +42,9 @@ def scrape_headings(url): def main(): - url = input("Enter the URL: ")ls - scrape_headings(url) + url = input("Enter the URL: ") + output_path = 'output.csv' + scrape_headings(url, output_path) # df = pd.DataFrame(scrape_headings(url)) search_again = input("Do you want to search again? y/n:").lower() if search_again == 'y':