added revised file for main.py
--- a/main.py
+++ b/main.py
@@ -1,22 +1,54 @@
 import requests
 from bs4 import BeautifulSoup
+import pandas as pd
 
-search_keyword = input("Enter a search keyword: ")
+
+def get_user_input():
+    urls = []
+    while True:
+        url = input("Enter a URL (or type 'done' to finish): ")
+        if url.lower() == 'done':
+            break
+        urls.append(url)
+    return urls
+
 
 def scrape_body(url):
-    response = requests.get(url, headers=headers)
-    soup = BeautifulSoup(response.text, 'html.parser')
-    body = soup.find('body')
-    body_list.append(body.prettify())
-    return body_list
+    try:
+        response = requests.get(url, headers=headers)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        body = soup.find('body')
+        return body.prettify() if body else "No body content"
+    except requests.RequestException as e:
+        return f"Error: {e}"
+
+
+def scrape_heads(url):
+    try:
+        response = requests.get(url, headers=headers)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        return [h.text for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
+    except requests.RequestException as e:
+        return [f"Error: {e}"]
+
 
 if __name__ == "__main__":
-    body_list = []
-    headers = {'User-Agent': 'myprogram/1.0'}
-    for i in range(1, 11):
-        search_url = 'https://www.google.com/search?q={}&start='
-        scrape_body('https://www.google.com/search?q={}&start=' + str(i))
-        search_url = search_url.format(search_keyword)
-        with open('output.txt', 'w+') as f:
-            f.writelines(body_list)
-        #end
+    urls = get_user_input()
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}
+
+    all_bodies = []
+    all_headers = []
+
+    for url in urls:
+        body_content = scrape_body(url)
+        all_bodies.append(body_content)
+
+        heads = scrape_heads(url)
+        all_headers.extend([(url, h) for h in heads])
+
+    with open('output.txt', 'w') as f:
+        f.writelines(all_bodies)
+
+    headers_df = pd.DataFrame(all_headers, columns=['URL', 'Heading'])
+    headers_df.to_csv('headers.csv', index=False)
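A note on the revised code: both scrape_body and scrape_heads read the module-level headers dict, which is only bound inside the if __name__ == "__main__": block. That works when the file runs as a script, because the functions are called after the binding, but it would raise a NameError if either function were imported and called on its own. A minimal sketch of one way to make the dependency explicit, using a placeholder User-Agent (a hypothetical refactor, not part of this commit):

import requests
from bs4 import BeautifulSoup

# Placeholder UA for the sketch; the commit uses a full Chrome UA string.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0'}


def scrape_body(url, headers=DEFAULT_HEADERS):
    # Fetch the page and return its prettified <body>, or an error string,
    # mirroring the committed behaviour but with headers passed in explicitly.
    try:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        body = soup.find('body')
        return body.prettify() if body else "No body content"
    except requests.RequestException as e:
        return f"Error: {e}"

One further behavioural detail: f.writelines(all_bodies) concatenates the body strings with no separators, since writelines adds no newlines; f.write("\n".join(all_bodies)) would keep one body per section of the output file.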