From 89b432ea2adcb1f01747241d9afbef7502f65772 Mon Sep 17 00:00:00 2001
From: master
Date: Sat, 30 Dec 2023 22:07:12 -0500
Subject: [PATCH] Add Browser agent

---
 main.py | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/main.py b/main.py
index cdda87f..dfc25ec 100644
--- a/main.py
+++ b/main.py
@@ -10,8 +10,12 @@ def scrape_headings(url):
     if not parsed_url.scheme:
         url = urlunparse(('http',) + parsed_url[1:])
 
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+    }
+
     # Send an HTTP GET request to the specified URL
-    response = requests.get(url)
+    response = requests.get(url, headers=headers)
 
     # Check if the request was successful
     if response.status_code == 200:
@@ -21,10 +25,12 @@
         # Find all the heading elements (h1, h2, h3, etc.)
         headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
 
-        # Extract and put to csv the result of headings
-        heading_list = [heading.text for heading in headings]
-        df = pd.DataFrame(heading_list)
-        df.to_csv('output.csv')
+        # Extract the result of headings with their types
+        heading_data = [{"Heading Type": heading.name, "Text": heading.text.strip()} for heading in headings]
+
+        # Convert to DataFrame and save to CSV
+        df = pd.DataFrame(heading_data)
+        df.to_csv('output.csv', index=False)
 
     else:
         print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
@@ -33,21 +39,18 @@
         print(f"An error occurred: {str(e)}")
 
 
-# To check if to do another search for URL
-def main_start():
-    if __name__ == "__main__":
-        url = input("Enter the URL: ")
-        scrape_headings(url)
-        # df = pd.DataFrame(scrape_headings(url))
-        search_again = input("Do you want to search again? y/n:").lower()
-        if search_again == 'y':
-            # df.to_csv('output.csv')
-            main_start()
-        else:
-            # df.to_csv('output.csv')
-            exit()
-
-
-main_start()
+def main():
+    url = input("Enter the URL: ")
+    scrape_headings(url)
+    # df = pd.DataFrame(scrape_headings(url))
+    search_again = input("Do you want to search again? y/n:").lower()
+    if search_again == 'y':
+        # df.to_csv('output.csv')
+        main()
+    else:
+        # df.to_csv('output.csv')
+        exit()
+if __name__ == "__main__":
+    main()
 
 