Compare commits

4 Commits

Author SHA1 Message Date
fe38c2226f Add output path 2023-12-31 23:50:03 -05:00
8e34d94acd Fix browser agent 2023-12-30 23:17:41 -05:00
125f968a47 Merge branch 'master' of gitea.gorillamail.biz:cid/Web_Scrape 2023-12-30 23:15:47 -05:00
cid
0c9cea5225 Updated main.py "added user-agent headers" 2023-12-29 15:02:15 -05:00
4 changed files with 5 additions and 3 deletions

0
FETCH_HEAD Normal file
View File

0
__init__.py Normal file
View File

0
git Normal file
View File

View File

@@ -3,7 +3,7 @@ from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse
import pandas as pd
def scrape_headings(url):
def scrape_headings(url, output_path):
try:
# Check if the URL has a scheme (http/https), and add one if missing
parsed_url = urlparse(url)
@@ -30,7 +30,8 @@ def scrape_headings(url):
# Convert to DataFrame and save to CSV
df = pd.DataFrame(heading_data)
df.to_csv('output.csv', index=False)
df.to_csv(output_path, index=False)
return heading_data
else:
print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
@@ -42,7 +43,8 @@ def scrape_headings(url):
def main():
url = input("Enter the URL: ")
scrape_headings(url)
output_path = 'output.csv'
scrape_headings(url, output_path)
# df = pd.DataFrame(scrape_headings(url))
search_again = input("Do you want to search again? y/n:").lower()
if search_again == 'y':