# Fetch a page and print its <title> text.
import os
import requests
from bs4 import BeautifulSoup

url = "https://www.google.com/"
response = requests.get(url)
if response.ok:
    soup = BeautifulSoup(response.text, "lxml")
    # soup.title is the <title> tag; get_text() strips the surrounding markup
    title = soup.title.get_text()
    print("The title is: " + title)
os.system("pause")  # Windows-only: keeps the console window open
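# A slightly more defensive variant of the snippet above (a sketch, not the
# original author's code): raise_for_status() turns HTTP error codes into
# exceptions and the timeout keeps the request from hanging indefinitely.
import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.google.com/", timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")
print("The title is:", soup.title.get_text())
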
# Scrapes Python's URL, version number and logo from its Wikipedia page:
# $ pip3 install requests beautifulsoup4
import requests, bs4, os, sys

URL = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
try:
    html = requests.get(URL).text
    document = bs4.BeautifulSoup(html, 'html.parser')
    table = document.find('table', class_='infobox vevent')
    python_url = table.find('th', text='Website').next_sibling.a['href']
    version = next(table.find('th', text='Stable release').next_sibling.strings)
    logo_url = table.find('img')['src']
    logo = requests.get(f'https:{logo_url}').content
    filename = os.path.basename(logo_url)
    with open(filename, 'wb') as file:
        file.write(logo)
    print(f'{python_url}, {version}, file://{os.path.abspath(filename)}')
except requests.exceptions.ConnectionError:
    print('Could not connect to the server.', file=sys.stderr)
# Import the required libraries
import requests
from bs4 import BeautifulSoup
# Send a request to the website you want to scrape
response = requests.get('https://example.com')
# Parse the HTML content of the webpage using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
# Find the desired element(s) within the parsed HTML
# For example, let's extract all the links on the page
links = soup.find_all('a')
# Iterate over the extracted links and print their text and URLs
for link in links:
    print(f"Text: {link.text}")
    print(f"URL: {link['href']}")
>>> from bs4 import BeautifulSoup
>>> raw_html = open('contrived.html').read()
>>> html = BeautifulSoup(raw_html, 'html.parser')
>>> for p in html.select('p'):
...     if p['id'] == 'walrus':
...         print(p.text)
...
I am the walrus
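>>> # The same lookup in one step with a CSS id selector -- a sketch against
>>> # the same contrived.html; select_one returns the first match or None.
>>> html.select_one('p#walrus').text
'I am the walrus'
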
import requests
from bs4 import BeautifulSoup
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
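# The snippet above stops after parsing. What you look up next depends entirely
# on Monster's markup, which changes over time; the id and class below are
# placeholders, not the site's real attributes -- inspect the page in your
# browser first. A hedged sketch that reuses the soup object from above:
results = soup.find(id='ResultsContainer')  # hypothetical container id
if results is not None:
    for job in results.find_all('section', class_='card-content'):  # hypothetical class
        title = job.find('h2')
        if title:
            print(title.get_text(strip=True))
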
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
# URL of the website to scrape
url = "https://www.imdb.com/chart/top"
# Send an HTTP GET request to the website
response = requests.get(url)
# Parse the HTML code using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# Extract the relevant information from the HTML code
movies = []
for row in soup.select('tbody.lister-list tr'):
    title = row.find('td', class_='titleColumn').find('a').get_text()
    year = row.find('td', class_='titleColumn').find('span', class_='secondaryInfo').get_text()[1:-1]
    rating = row.find('td', class_='ratingColumn imdbRating').find('strong').get_text()
    movies.append([title, year, rating])
# Store the information in a pandas dataframe
df = pd.DataFrame(movies, columns=['Title', 'Year', 'Rating'])
# Pause briefly before making any follow-up requests so you don't overwhelm the site
time.sleep(1)
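# A common next step (my addition, not part of the snippet above): preview the
# scraped rows and write the dataframe to disk. to_csv is a standard pandas
# method; the filename is arbitrary.
print(df.head())
df.to_csv('imdb_top_movies.csv', index=False)
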
import requests
from bs4 import BeautifulSoup
url = 'YOUR_WEBSITE_URL_HERE'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
# Extract the elements you need from the website using BeautifulSoup
# For example, to extract all the links on the page:
for link in soup.find_all('a'):
    print(link.get('href'))
# Note: requests only downloads the initial HTML. Content that a site renders
# with JavaScript will not appear in this soup; for that you need a browser
# automation tool such as Selenium or Playwright.
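# A minimal Selenium sketch for JavaScript-rendered pages (assumes
# `pip install selenium` and a compatible browser/driver; the URL is a
# placeholder). Selenium runs the page's JavaScript in a real browser, then
# hands the rendered HTML to BeautifulSoup for the same kind of parsing as above.
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('YOUR_WEBSITE_URL_HERE')
rendered_html = driver.page_source  # HTML after JavaScript has run
driver.quit()

soup = BeautifulSoup(rendered_html, 'html.parser')
for link in soup.find_all('a'):
    print(link.get('href'))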