import time
import requests
from bs4 import BeautifulSoup
import json
import string
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
base = "https://www.nytimes.com"
browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
wait = WebDriverWait(browser, 10)
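# Load the NYT search page, filtered to "cybersecurity" articles published
# between 2018-04-01 and 2019-03-31, sorted newest first.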
browser.get('https://www.nytimes.com/search?endDate=20190331&query=cybersecurity&sort=newest&startDate=20180401')
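# Keep clicking "Show More" until the button can no longer be found, so every
# search result is present in the page before it is parsed.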
while True:
    try:
        time.sleep(1)
        show_more = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[@type="button"][contains(.,"Show More")]')))
        show_more.click()
    except Exception as e:
        print(e)
        break
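# With all results loaded, hand the rendered page source to BeautifulSoup and
# grab the ordered list that holds the search results.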
soup = BeautifulSoup(browser.page_source,'lxml')
search_results = soup.find('ol', {'data-testid':'search-results'})
links = search_results.find_all('a')
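# Visit each result, fetch the article page with requests, and recover the body text
# from the JSON blob the page embeds in window.__preloadedData.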
for link in links:
    link_url = link['href']
    # The headline sits in an <h4> inside the result card; the publish date is in the
    # nearest <time> element that follows it.
    title = link.find('h4').text
    date = link.find_next('time').text
    print(date + ': ' + title + '\n')

    # Search results use relative URLs, so prepend the site root before fetching.
    response = requests.get(base + link_url)
    soup_link = BeautifulSoup(response.text, 'html.parser')
    scripts = soup_link.find_all('script')
    for script in scripts:
        if 'window.__preloadedData = ' in script.text:
            # Strip the JavaScript assignment and the trailing semicolon, leaving raw JSON.
            jsonStr = script.text
            jsonStr = jsonStr.split('window.__preloadedData = ')[-1]
            jsonStr = jsonStr.rsplit(';', 1)[0]
            jsonData = json.loads(jsonStr)

            # Collect every TextInline node, which holds the visible text fragments.
            article = []
            for k, v in jsonData['initialState'].items():
                try:
                    if v['__typename'] == 'TextInline':
                        article.append(v['text'])
                except (KeyError, TypeError):
                    # Entries that are not dicts or lack a __typename are skipped.
                    continue

            # Rejoin the fragments, adding a space before each piece unless it is punctuation.
            article = [each.strip() for each in article]
            article = ''.join([('' if c in string.punctuation else ' ') + c for c in article]).strip()
            print(article + '\n')
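# All articles processed; close the browser session.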
print("Complete")
browser.quit()