Compare commits

...

9 Commits

Author  SHA1        Message                                        Date
        89b432ea2a  Add Browser agent                              2023-12-30 22:07:12 -05:00
cid     0c3cfdf0e9  Merge remote-tracking branch 'origin/master'   2023-12-23 23:58:20 +08:00
cid     8b837b2d7c  included csv function                          2023-12-23 23:55:11 +08:00
cid     d0c541a318  added csv                                      2023-12-23 23:40:41 +08:00
cid     bc1619f11e  added a csv output function                    2023-12-23 23:34:57 +08:00
cid     e2fe8ab8e8  added csv function                             2023-12-23 23:14:12 +08:00
cid     0627bd446b  Merge remote-tracking branch 'origin/main'     2023-12-23 09:35:55 +08:00
cid     96d8926ac8  added chromedriver and main.py                 2023-12-23 09:00:11 +08:00
        3595537ed1  Initial commit                                 2023-12-22 16:53:14 -08:00
3 changed files with 57 additions and 0 deletions

README.md (Normal file, 1 line added)

@@ -0,0 +1 @@
# WebScrape

chromedriver (Executable file, binary)

Binary file not shown.

main.py (Normal file, 56 lines added)

@@ -0,0 +1,56 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse
import pandas as pd


def scrape_headings(url):
    try:
        # Check if the URL has a scheme (http/https), and add one if missing
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = urlunparse(('http',) + parsed_url[1:])
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        # Send an HTTP GET request to the specified URL
        response = requests.get(url, headers=headers)
        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find all the heading elements (h1, h2, h3, etc.)
            headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
            # Pair each heading's tag name with its stripped text
            heading_data = [{"Heading Type": heading.name, "Text": heading.text.strip()} for heading in headings]
            # Convert to DataFrame and save to CSV
            df = pd.DataFrame(heading_data)
            df.to_csv('output.csv', index=False)
        else:
            print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")


def main():
    url = input("Enter the URL: ")
    scrape_headings(url)
    # df = pd.DataFrame(scrape_headings(url))
    search_again = input("Do you want to search again? y/n: ").lower()
    if search_again == 'y':
        # df.to_csv('output.csv')
        main()
    else:
        # df.to_csv('output.csv')
        exit()


if __name__ == "__main__":
    main()
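
For reference, a run of main.py prompts for a URL and writes the scraped headings to output.csv with the two columns defined above ("Heading Type" and "Text"). Against a hypothetical page containing one h1 and two h2 headings, the file would look roughly like this (the row values are illustrative only, not output from a real page):

Heading Type,Text
h1,Example Domain
h2,More information
h2,Related links

Note that df.to_csv('output.csv', index=False) overwrites the file on every call, so when the user answers 'y' and scrapes a second URL, only the most recent page's headings are kept.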