From 9373a2eed1ebc150fd657bcc9ec3848e5a15a5ec Mon Sep 17 00:00:00 2001
From: cid
Date: Sun, 31 Dec 2023 17:36:55 +0800
Subject: [PATCH] Add revised file for main.py

---
 lib/python3.12/site-packages/main.py | 62 +++++++++++++++++++++-------
 1 file changed, 47 insertions(+), 15 deletions(-)

diff --git a/lib/python3.12/site-packages/main.py b/lib/python3.12/site-packages/main.py
index 0cc16d0..3b77072
--- a/lib/python3.12/site-packages/main.py
+++ b/lib/python3.12/site-packages/main.py
@@ -1,22 +1,54 @@
 import requests
 from bs4 import BeautifulSoup
+import pandas as pd
+
+
+def get_user_input():
+    urls = []
+    while True:
+        url = input("Enter a URL (or type 'done' to finish): ")
+        if url.lower() == 'done':
+            break
+        urls.append(url)
+    return urls
 
-search_keyword = input("Enter a search keyword: ")
 
 def scrape_body(url):
-    response = requests.get(url, headers=headers)
-    soup = BeautifulSoup(response.text, 'html.parser')
-    body = soup.find('body')
-    body_list.append(body.prettify())
-    return body_list
+    try:
+        response = requests.get(url, headers=headers)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        body = soup.find('body')
+        return body.prettify() if body else "No body content"
+    except requests.RequestException as e:
+        return f"Error: {e}"
+
+
+def scrape_heads(url):
+    try:
+        response = requests.get(url, headers=headers)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        return [h.text for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
+    except requests.RequestException as e:
+        return [f"Error: {e}"]
+
 
 if __name__ == "__main__":
-    body_list = []
-    headers = {'User-Agent': 'myprogram/1.0'}
-    for i in range(1, 11):
-        search_url = 'https://www.google.com/search?q={}&start='
-        scrape_body('https://www.google.com/search?q={}&start=' + str(i))
-        search_url = search_url.format(search_keyword)
-    with open('output.txt', 'w+') as f:
-        f.writelines(body_list)
-    #end
\ No newline at end of file
+    urls = get_user_input()
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}
+
+    all_bodies = []
+    all_headers = []
+
+    for url in urls:
+        body_content = scrape_body(url)
+        all_bodies.append(body_content)
+
+        heads = scrape_heads(url)
+        all_headers.extend([(url, h) for h in heads])
+
+    with open('output.txt', 'w') as f:
+        f.writelines(all_bodies)
+
+    headers_df = pd.DataFrame(all_headers, columns=['URL', 'Heading'])
+    headers_df.to_csv('headers.csv', index=False)
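
-- 
A quick usage sketch (a trailing note, not part of the commit): with the
requests, beautifulsoup4, and pandas packages installed, an interactive run
of the revised script looks like the following, where https://example.com
stands in for any real page:

    $ python main.py
    Enter a URL (or type 'done' to finish): https://example.com
    Enter a URL (or type 'done' to finish): done

The prettified <body> of each page is concatenated into output.txt, and the
collected headings are saved to headers.csv as (URL, Heading) rows. One thing
to keep in mind: scrape_body() and scrape_heads() read `headers` as a
module-level global that is only assigned inside the __main__ block, so the
functions work when main.py runs as a script but will raise NameError if
imported and called on their own without setting that global first.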