Add output path

This commit is contained in:
2023-12-31 23:50:03 -05:00
parent 8e34d94acd
commit fe38c2226f
4 changed files with 6 additions and 4 deletions

0
FETCH_HEAD Normal file
View File

0
__init__.py Normal file
View File

0
git Normal file
View File

10
main.py
View File

@@ -3,7 +3,7 @@ from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse from urllib.parse import urlparse, urlunparse
import pandas as pd import pandas as pd
def scrape_headings(url): def scrape_headings(url, output_path):
try: try:
# Check if the URL has a scheme (http/https), and add one if missing # Check if the URL has a scheme (http/https), and add one if missing
parsed_url = urlparse(url) parsed_url = urlparse(url)
@@ -30,7 +30,8 @@ def scrape_headings(url):
# Convert to DataFrame and save to CSV # Convert to DataFrame and save to CSV
df = pd.DataFrame(heading_data) df = pd.DataFrame(heading_data)
df.to_csv('output.csv', index=False) df.to_csv(output_path, index=False)
return heading_data
else: else:
print(f"Failed to retrieve content from {url}. Status code: {response.status_code}") print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
@@ -41,8 +42,9 @@ def scrape_headings(url):
def main(): def main():
url = input("Enter the URL: ")ls url = input("Enter the URL: ")
scrape_headings(url) output_path = 'output.csv'
scrape_headings(url, output_path)
# df = pd.DataFrame(scrape_headings(url)) # df = pd.DataFrame(scrape_headings(url))
search_again = input("Do you want to search again? y/n:").lower() search_again = input("Do you want to search again? y/n:").lower()
if search_again == 'y': if search_again == 'y':