Compare commits


4 Commits

SHA1        Message                                                        Date
fe38c2226f  Add output path                                                2023-12-31 23:50:03 -05:00
8e34d94acd  Fix browser agent                                              2023-12-30 23:17:41 -05:00
125f968a47  Merge branch 'master' of gitea.gorillamail.biz:cid/Web_Scrape  2023-12-30 23:15:47 -05:00
89b432ea2a  Add Browser agent                                              2023-12-30 22:07:12 -05:00
4 changed files with 26 additions and 22 deletions

FETCH_HEAD  (Normal file, 0 changes)

__init__.py  (Normal file, 0 changes)

git  (Normal file, 0 changes)

main.py  (32 changes)

@@ -3,14 +3,17 @@ from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urlunparse
 import pandas as pd
 
-headers = {'User-Agent': 'myprogram/1.0'}
-
-def scrape_headings(url):
+def scrape_headings(url, output_path):
     try:
         # Check if the URL has a scheme (http/https), and add one if missing
         parsed_url = urlparse(url)
         if not parsed_url.scheme:
             url = urlunparse(('http',) + parsed_url[1:])
 
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+        }
+
         # Send an HTTP GET request to the specified URL
         response = requests.get(url, headers=headers)
@@ -22,10 +25,13 @@ def scrape_headings(url):
             # Find all the heading elements (h1, h2, h3, etc.)
             headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
 
-            # Extract and put to csv the result of headings
-            heading_list = [heading.text for heading in headings]
-            df = pd.DataFrame(heading_list)
-            df.to_csv('output.csv')
+            # Extract the result of headings with their types
+            heading_data = [{"Heading Type": heading.name, "Text": heading.text.strip()} for heading in headings]
+
+            # Convert to DataFrame and save to CSV
+            df = pd.DataFrame(heading_data)
+            df.to_csv(output_path, index=False)
+            return heading_data
         else:
             print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
@@ -34,21 +40,19 @@ def scrape_headings(url):
         print(f"An error occurred: {str(e)}")
 
-# To check if to do another search for URL
-def main_start():
-
-if __name__ == "__main__":
+def main():
     url = input("Enter the URL: ")
-    scrape_headings(url)
+    output_path = 'output.csv'
+    scrape_headings(url, output_path)
     # df = pd.DataFrame(scrape_headings(url))
     search_again = input("Do you want to search again? y/n:").lower()
     if search_again == 'y':
         # df.to_csv('output.csv')
-        main_start()
+        main()
     else:
         # df.to_csv('output.csv')
         exit()
 
-    main_start()
+if __name__ == "__main__":
+    main()
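
For reference, here is the updated main.py reconstructed from the hunks above. This is a sketch, not the verbatim file: the compare view hides the import block and the lines between the first two hunks, so the `import requests` line, the `status_code == 200` check, the `BeautifulSoup(response.text, 'html.parser')` call, and the `except` clause are assumptions filled in from the surrounding context.

import requests  # assumption: imported above the first visible hunk line
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse
import pandas as pd

def scrape_headings(url, output_path):
    try:
        # Check if the URL has a scheme (http/https), and add one if missing
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = urlunparse(('http',) + parsed_url[1:])

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }

        # Send an HTTP GET request to the specified URL
        response = requests.get(url, headers=headers)

        # Assumption: the lines elided between the hunks check the status
        # and parse the response body
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all the heading elements (h1, h2, h3, etc.)
            headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

            # Extract each heading's tag name and text
            heading_data = [{"Heading Type": heading.name, "Text": heading.text.strip()} for heading in headings]

            # Convert to DataFrame and save to CSV
            df = pd.DataFrame(heading_data)
            df.to_csv(output_path, index=False)
            return heading_data
        else:
            print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
    except Exception as e:  # assumption: the hidden except clause binds `e`
        print(f"An error occurred: {str(e)}")

def main():
    url = input("Enter the URL: ")
    output_path = 'output.csv'
    scrape_headings(url, output_path)
    search_again = input("Do you want to search again? y/n:").lower()
    if search_again == 'y':
        main()
    else:
        exit()

if __name__ == "__main__":
    main()

Run as a script, it prompts for a URL, writes each heading's tag name and text to output.csv, and recurses into main() while the user keeps answering 'y'.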