Compare commits
9 Commits
c246a09b89
...
thierry-de
| Author | SHA1 | Date | |
|---|---|---|---|
| 89b432ea2a | |||
| 0c3cfdf0e9 | |||
| 8b837b2d7c | |||
| d0c541a318 | |||
| bc1619f11e | |||
| e2fe8ab8e8 | |||
| 0627bd446b | |||
| 96d8926ac8 | |||
| 3595537ed1 |
BIN
chromedriver
Executable file
BIN
chromedriver
Executable file
Binary file not shown.
56
main.py
Normal file
56
main.py
Normal file
@ -0,0 +1,56 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
import pandas as pd
|
||||
|
||||
def scrape_headings(url):
    """Scrape all heading tags (h1-h6) from *url* and save them to output.csv.

    Parameters
    ----------
    url : str
        Page to scrape. A missing scheme is defaulted to ``http``.

    Returns
    -------
    list[dict] | None
        The scraped headings as ``[{"Heading Type": ..., "Text": ...}, ...]``
        on success, or ``None`` when the request failed or an error occurred.
    """
    try:
        # Check if the URL has a scheme (http/https), and add one if missing
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = urlunparse(('http',) + parsed_url[1:])

        # Browser-like User-Agent avoids trivial bot blocking by some sites.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }

        # Send an HTTP GET request; timeout prevents hanging forever on a
        # dead or unresponsive host (the original call had no timeout).
        response = requests.get(url, headers=headers, timeout=10)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all the heading elements (h1, h2, h3, etc.)
            headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

            # Extract the result of headings with their types
            heading_data = [
                {"Heading Type": heading.name, "Text": heading.text.strip()}
                for heading in headings
            ]

            # Convert to DataFrame and save to CSV
            df = pd.DataFrame(heading_data)
            df.to_csv('output.csv', index=False)

            # Return the data so callers (see the commented-out DataFrame
            # code in main) can reuse it instead of re-reading the CSV.
            return heading_data
        else:
            print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
    except Exception as e:
        # Broad catch keeps the interactive session alive on any
        # network/parse error; the error is reported, not swallowed.
        print(f"An error occurred: {str(e)}")
    return None
||||
def main():
    """Interactively prompt for URLs and scrape each until the user quits.

    Uses an iterative loop instead of the original recursive self-call,
    which grew the call stack by one frame per scraped URL. Any answer
    other than 'y' (case-insensitive) ends the session normally, replacing
    the original `exit()` call (a `site` convenience not meant for scripts).
    """
    while True:
        url = input("Enter the URL: ")
        scrape_headings(url)
        search_again = input("Do you want to search again? y/n:").lower()
        if search_again != 'y':
            break
||||
# Run the interactive scraper only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Reference in New Issue
Block a user