Compare commits
4 Commits
thierry-de
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| | fe38c2226f | | |
| | 8e34d94acd | | |
| | 125f968a47 | | |
| | 0c9cea5225 | | |
0
FETCH_HEAD
Normal file
0
FETCH_HEAD
Normal file
0
__init__.py
Normal file
0
__init__.py
Normal file
8
main.py
8
main.py
@@ -3,7 +3,7 @@ from bs4 import BeautifulSoup
|
|||||||
from urllib.parse import urlparse, urlunparse
|
from urllib.parse import urlparse, urlunparse
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
def scrape_headings(url):
|
def scrape_headings(url, output_path):
|
||||||
try:
|
try:
|
||||||
# Check if the URL has a scheme (http/https), and add one if missing
|
# Check if the URL has a scheme (http/https), and add one if missing
|
||||||
parsed_url = urlparse(url)
|
parsed_url = urlparse(url)
|
||||||
@@ -30,7 +30,8 @@ def scrape_headings(url):
|
|||||||
|
|
||||||
# Convert to DataFrame and save to CSV
|
# Convert to DataFrame and save to CSV
|
||||||
df = pd.DataFrame(heading_data)
|
df = pd.DataFrame(heading_data)
|
||||||
df.to_csv('output.csv', index=False)
|
df.to_csv(output_path, index=False)
|
||||||
|
return heading_data
|
||||||
|
|
||||||
else:
|
else:
|
||||||
print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
|
print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
|
||||||
@@ -42,7 +43,8 @@ def scrape_headings(url):
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
url = input("Enter the URL: ")
|
url = input("Enter the URL: ")
|
||||||
scrape_headings(url)
|
output_path = 'output.csv'
|
||||||
|
scrape_headings(url, output_path)
|
||||||
# df = pd.DataFrame(scrape_headings(url))
|
# df = pd.DataFrame(scrape_headings(url))
|
||||||
search_again = input("Do you want to search again? y/n:").lower()
|
search_again = input("Do you want to search again? y/n:").lower()
|
||||||
if search_again == 'y':
|
if search_again == 'y':
|
||||||
|
|||||||
Reference in New Issue
Block a user