
Iterate Over URLs For Web Scraping Using BeautifulSoup

This is my code to scrape odds from www.oddsportal.com.

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
...

Solution 1:

I had to make some adjustments to the function generate_matches, since the returning of certain class names was not reliable. I also removed the global statements from that function, which I never should have had.

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
import os
import re

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Un-comment next line to suppress logging:
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        # print('The driver has been "quitted".')

threadLocal = threading.local()

def create_driver():
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver

class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

def generate_matches(table):
    tr_tags = table.findAll('tr')
    for tr_tag in tr_tags:
        if 'class' in tr_tag.attrs and 'dark' in tr_tag['class']:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.findAll('a')
            country = a_tags[0].text
            league = a_tags[1].text
        else:
            td_tags = tr_tag.findAll('td')
            yield td_tags[0].text, td_tags[1].text, td_tags[2].text, td_tags[3].text, \
                  td_tags[4].text, td_tags[5].text, country, league

def parse_data(url, return_urls=False):
    browser = create_driver()
    browser.get(url)
    soup = bs(browser.page_source, "lxml")
    div = soup.find('div', {'id': 'col-content'})
    table = div.find('table', {'class': 'table-main'})
    h1 = soup.find('h1').text
    m = re.search(r'\d+ \w+ \d{4}$', h1)
    game_date = m[0]
    game_data = GameData()
    for row in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(row[0])
        game_data.game.append(row[1])
        game_data.score.append(row[2])
        game_data.home_odds.append(row[3])
        game_data.draw_odds.append(row[4])
        game_data.away_odds.append(row[5])
        game_data.country.append(row[6])
        game_data.league.append(row[7])

    if return_urls:
        span = soup.find('span', {'class': 'next-games-date'})
        a_tags = span.findAll('a')
        urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags]
        return game_data, urls
    return game_data

if __name__ == '__main__':
    results = None
    pool = ThreadPool(5)  # We will be getting, however, 7 URLs
    # Get today's data and the URLs for the other days:
    game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
    urls.pop(1) # Remove url for today: We already have the data for that
    game_data_results = pool.imap(parse_data, urls)
    for i in range(8):
        game_data = game_data_today if i == 1 else next(game_data_results)
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead
            results = pd.concat([results, result], ignore_index=True)

    print(results)
    # print(results.head())
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc

    gc.collect()  # a little extra insurance

Prints:

             date   time                                 game score home_odds draw_odds away_odds     country            league
0     07 Sep 2021  00:00             Pachuca W - Monterrey W    0:1      +219      +280      -106      Mexico     Liga MX Women
1     07 Sep 2021  01:05              Millonarios - Patriotas   1:0      -303      +380      +807    Colombia         Primera A
2     07 Sep 2021  02:00        Club Tijuana W - Club Leon W    4:0      -149      +293      +311      Mexico     Liga MX Women
3     07 Sep 2021  08:30         Suzhou Dongwu - Nanjing City   0:0      +165      +190      +177       China        Jia League
4     07 Sep 2021  08:45       Kuching City FC - Sarawak Utd.   1:0      +309      +271      -143    Malaysia    Premier League
...           ...    ...                                  ...   ...       ...       ...       ...         ...               ...
1305  14 Sep 2021  21:45       Central Cordoba - Atl. Tucuman  +192      +217      +146        13   Argentina  Liga Profesional
1306  14 Sep 2021  22:00                  Colo Colo - Everton  -141      +249      +395        11       Chile  Primera Division
1307  14 Sep 2021  23:30   Columbus Crew - New York Red Bulls     -         -         -         1         USA               MLS
1308  14 Sep 2021  23:30            New York City - FC Dallas     -         -         -         1         USA               MLS
1309  14 Sep 2021  23:30             Toronto FC - Inter Miami     -         -         -         1         USA               MLS

[1310 rows x 9 columns]
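A note on robustness: generate_matches assumes that a header row (class dark) always appears before the match rows it labels, and that every non-header row has at least six td cells. Since the answer above already mentions that the returned class names were not reliable, a defensive variant (a sketch, not part of the answer's code) could skip any rows that arrive before the first header or that are missing cells:

def generate_matches(table):
    country = league = None  # not set until the first 'dark' header row
    for tr_tag in table.findAll('tr'):
        if 'class' in tr_tag.attrs and 'dark' in tr_tag['class']:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.findAll('a')
            country, league = a_tags[0].text, a_tags[1].text
        else:
            td_tags = tr_tag.findAll('td')
            # skip rows seen before the first header or with too few cells
            if country is None or len(td_tags) < 6:
                continue
            yield (td_tags[0].text, td_tags[1].text, td_tags[2].text,
                   td_tags[3].text, td_tags[4].text, td_tags[5].text,
                   country, league)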

Solution 2:

I'd suggest you integrate this method when iterating over URLs.

Code snippet:

from bs4 import BeautifulSoup
from selenium import webdriver

# the snippet below assumes a driver instance
driver = webdriver.Chrome()

# assuming you have a list of start_urls
start_urls = ['https://www.oddsportal.com/matches/soccer/20210903/']

urls = []

# get links for yesterday, today, tomorrow and the next 5 days
for start_url in start_urls:
    driver.get(start_url)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'lxml')
    dates = soup.find('span', class_='next-games-date')
    links = dates.find_all('a')
    for link in links:
        urls.append('https://www.oddsportal.com' + link['href'])

# get data from each link
for url in urls:
    driver.get(url)
    # function call to parse data
    # function call to append data
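As an illustration of those last two placeholder comments, here is a minimal sketch of parsing and appending (the table selector is borrowed from Solution 1; the all_rows name and the column names are assumptions, not part of the answer):

import pandas as pd

all_rows = []  # accumulate one list of cell texts per match
for url in urls:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find('table', {'class': 'table-main'})  # same table as in Solution 1
    for tr in table.findAll('tr'):
        tds = tr.findAll('td')
        if len(tds) >= 6:  # skip header/separator rows
            all_rows.append([td.text for td in tds[:6]])  # append parsed data

df = pd.DataFrame(all_rows, columns=['time', 'game', 'score',
                                     'home_odds', 'draw_odds', 'away_odds'])
print(df)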
