From 9373a2eed1ebc150fd657bcc9ec3848e5a15a5ec Mon Sep 17 00:00:00 2001
From: cid
Date: Sun, 31 Dec 2023 17:36:55 +0800
Subject: [PATCH] Add revised file for main.py

---
 lib/python3.12/site-packages/main.py | 62 +++++++++++++++++++++-------
 1 file changed, 47 insertions(+), 15 deletions(-)

diff --git a/lib/python3.12/site-packages/main.py b/lib/python3.12/site-packages/main.py
index 0cc16d0..3b77072
--- a/lib/python3.12/site-packages/main.py
+++ b/lib/python3.12/site-packages/main.py
@@ -1,22 +1,54 @@
 import requests
 from bs4 import BeautifulSoup
+import pandas as pd
+
+
+def get_user_input():
+    urls = []
+    while True:
+        url = input("Enter a URL (or type 'done' to finish): ")
+        if url.lower() == 'done':
+            break
+        urls.append(url)
+    return urls
 
-search_keyword = input("Enter a search keyword: ")
 
 def scrape_body(url):
-    response = requests.get(url, headers=headers)
-    soup = BeautifulSoup(response.text, 'html.parser')
-    body = soup.find('body')
-    body_list.append(body.prettify())
-    return body_list
+    try:
+        response = requests.get(url, headers=headers)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        body = soup.find('body')
+        return body.prettify() if body else "No body content"
+    except requests.RequestException as e:
+        return f"Error: {e}"
+
+
+def scrape_heads(url):
+    try:
+        response = requests.get(url, headers=headers)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        return [h.text for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
+    except requests.RequestException as e:
+        return [f"Error: {e}"]
+
 
 if __name__ == "__main__":
-    body_list = []
-    headers = {'User-Agent': 'myprogram/1.0'}
-    for i in range(1, 11):
-        search_url = 'https://www.google.com/search?q={}&start='
-        scrape_body('https://www.google.com/search?q={}&start=' + str(i))
-        search_url = search_url.format(search_keyword)
-    with open('output.txt', 'w+') as f:
-        f.writelines(body_list)
-    #end
\ No newline at end of file
+    urls = get_user_input()
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}
+
+    all_bodies = []
+    all_headers = []
+
+    for url in urls:
+        body_content = scrape_body(url)
+        all_bodies.append(body_content)
+
+        heads = scrape_heads(url)
+        all_headers.extend([(url, h) for h in heads])
+
+    with open('output.txt', 'w') as f:
+        f.writelines(all_bodies)
+
+    headers_df = pd.DataFrame(all_headers, columns=['URL', 'Heading'])
+    headers_df.to_csv('headers.csv', index=False)
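
-- 
A quick usage sketch (a trailing note, not part of the commit): with the
requests, beautifulsoup4, and pandas packages installed, an interactive run
of the revised script looks like the following, where https://example.com
stands in for any real page:

    $ python main.py
    Enter a URL (or type 'done' to finish): https://example.com
    Enter a URL (or type 'done' to finish): done

The prettified <body> of each page is concatenated into output.txt, and the
collected headings are saved to headers.csv as (URL, Heading) rows. One thing
to keep in mind: scrape_body() and scrape_heads() read `headers` as a
module-level global that is only assigned inside the __main__ block, so the
functions work when main.py runs as a script but will raise NameError if
imported and called on their own without setting that global first.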