Add pythonProject3

This commit is contained in:
cid
2023-12-21 01:17:03 +08:00
commit c246a09b89
7 changed files with 75 additions and 0 deletions

3
PycharmProjects/pythonProject3/.idea/.gitignore generated vendored Normal file
View File

@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.12 (WebScrape with URL input)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (WebScrape with URL input)" project-jdk-type="Python SDK" />
</project>

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/pythonProject3.iml" filepath="$PROJECT_DIR$/.idea/pythonProject3.iml" />
</modules>
</component>
</project>

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

Binary file not shown.

View File

@ -0,0 +1,43 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse
def scrape_headings(url, timeout=10):
    """Fetch *url* and print the text of every heading (h1-h6) on the page.

    Args:
        url: Web address to scrape. If the scheme (http/https) is missing,
            ``http://`` is assumed.
        timeout: Seconds to wait for the HTTP response before giving up.
            Prevents the script from hanging forever on a dead host.
    """
    try:
        # Default to http:// when the user typed a bare host like "example.com".
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            url = urlunparse(('http',) + parsed_url[1:])
        # A bounded timeout is essential: requests.get() with no timeout
        # blocks indefinitely if the server never responds.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # All six HTML heading levels, in document order.
            headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
            for heading in headings:
                print(heading.text)
        else:
            print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
    except Exception as e:
        # Top-level boundary for this best-effort tool: report and continue
        # rather than crash the interactive loop on bad input or network error.
        print(f"An error occurred: {str(e)}")
def main_start():
    """Interactive driver: prompt for URLs and scrape each until the user quits.

    Uses a ``while`` loop instead of the original self-recursion, so an
    arbitrarily long session cannot grow the call stack.
    """
    while True:
        url = input("Enter the URL: ")
        scrape_headings(url)
        search_again = input("Do you want to search again? y/n:").lower()
        if search_again != 'y':
            # Any answer other than 'y' ends the session (matches the
            # original behavior, which treated everything else as "no").
            break


if __name__ == "__main__":
    # Guard at module level so importing this file does not start the
    # interactive loop; running it as a script behaves exactly as before.
    main_start()