Compare commits
1 commit

0c9cea5225 ...

| Author | SHA1 | Date |
|---|---|---|
| thierry-de | 89b432ea2a | |

main.py: 27 changed lines
```diff
@@ -10,8 +10,12 @@ def scrape_headings(url):
         if not parsed_url.scheme:
             url = urlunparse(('http',) + parsed_url[1:])
 
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+        }
+
         # Send an HTTP GET request to the specified URL
-        response = requests.get(url)
+        response = requests.get(url, headers=headers)
 
         # Check if the request was successful
         if response.status_code == 200:
```
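The new header block works around servers that reject requests bearing the default `python-requests` User-Agent string. A minimal standalone sketch of the same call pattern (the target URL here is a placeholder, not taken from the commit):

```python
import requests

# Some sites return 403 for the default 'python-requests/x.y' User-Agent,
# so the commit sends a browser-like one instead.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.3'
}

# Placeholder URL, used only to illustrate the call.
response = requests.get('https://example.com', headers=headers)
print(response.status_code)
```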
```diff
@@ -21,10 +25,12 @@ def scrape_headings(url):
             # Find all the heading elements (h1, h2, h3, etc.)
             headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
 
-            # Extract and put to csv the result of headings
-            heading_list = [heading.text for heading in headings]
-            df = pd.DataFrame(heading_list)
-            df.to_csv('output.csv')
+            # Extract the result of headings with their types
+            heading_data = [{"Heading Type": heading.name, "Text": heading.text.strip()} for heading in headings]
+
+            # Convert to DataFrame and save to CSV
+            df = pd.DataFrame(heading_data)
+            df.to_csv('output.csv', index=False)
 
         else:
             print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
```
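Switching from a flat list of strings to a list of dicts is what gives the CSV named columns: pandas turns each dict key into a column header, and `index=False` drops the numeric row index from the file. A self-contained sketch of just that step, using inline sample markup in place of a fetched page:

```python
from bs4 import BeautifulSoup
import pandas as pd

# Sample markup standing in for a live page.
html = "<h1>Title</h1><h2>Section</h2><h3>Subsection</h3>"
soup = BeautifulSoup(html, "html.parser")

headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

# Each dict becomes one CSV row; the keys become the column headers.
heading_data = [{"Heading Type": h.name, "Text": h.text.strip()} for h in headings]

df = pd.DataFrame(heading_data)
df.to_csv('output.csv', index=False)
print(df)
```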
```diff
@@ -33,21 +39,18 @@ def scrape_headings(url):
         print(f"An error occurred: {str(e)}")
 
 
-# To check if to do another search for URL
-def main_start():
-
-if __name__ == "__main__":
+def main():
     url = input("Enter the URL: ")
     scrape_headings(url)
     # df = pd.DataFrame(scrape_headings(url))
     search_again = input("Do you want to search again? y/n:").lower()
     if search_again == 'y':
         # df.to_csv('output.csv')
-        main_start()
+        main()
     else:
         # df.to_csv('output.csv')
         exit()
 
-
-    main_start()
+if __name__ == "__main__":
+    main()
 
```
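The final hunk folds the interactive prompt into a proper `main()` function and moves the entry-point guard to the bottom of the file. Reduced to its control flow (with the scraping call stubbed out), the post-commit structure reads roughly like this; note that answering 'y' re-enters `main()` recursively rather than looping:

```python
def main():
    url = input("Enter the URL: ")
    # scrape_headings(url) would run here in the real script.
    search_again = input("Do you want to search again? y/n:").lower()
    if search_again == 'y':
        main()  # recursive re-entry, as in the commit
    else:
        exit()

if __name__ == "__main__":
    main()
```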