Iterate Over URLs For Web Scraping Using BeautifulSoup
This is my code to scrape odds from www.oddsportal.com:

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
Solution 1:
I had to make some adjustments to the function generate_matches, since the returning of certain class names was not reliable. I also removed global statements from that function that I never should have had.
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
import os
import re
class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Un-comment next line to suppress logging:
        #options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        # print('The driver has been "quitted".')
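# Each pool thread gets its own Driver (and thus its own Chrome instance),
# cached in thread-local storage below, so one browser is reused for all the
# URLs handled by that thread.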
threadLocal = threading.local()

def create_driver():
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver
class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []
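# Rows whose class contains 'dark' are country/league header rows; every other
# row is a match belonging to the most recently seen header.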
def generate_matches(table):
    tr_tags = table.findAll('tr')
    for tr_tag in tr_tags:
        if 'class' in tr_tag.attrs and 'dark' in tr_tag['class']:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.findAll('a')
            country = a_tags[0].text
            league = a_tags[1].text
        else:
            td_tags = tr_tag.findAll('td')
            yield td_tags[0].text, td_tags[1].text, td_tags[2].text, td_tags[3].text, \
                  td_tags[4].text, td_tags[5].text, country, league
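# parse_data loads one day's page, takes the game date from the <h1> heading,
# and, when return_urls=True, also collects the links to the other days.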
def parse_data(url, return_urls=False):
    browser = create_driver()
    browser.get(url)
    soup = bs(browser.page_source, "lxml")
    div = soup.find('div', {'id': 'col-content'})
    table = div.find('table', {'class': 'table-main'})
    h1 = soup.find('h1').text
    m = re.search(r'\d+ \w+ \d{4}$', h1)
    game_date = m[0]
    game_data = GameData()
    for row in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(row[0])
        game_data.game.append(row[1])
        game_data.score.append(row[2])
        game_data.home_odds.append(row[3])
        game_data.draw_odds.append(row[4])
        game_data.away_odds.append(row[5])
        game_data.country.append(row[6])
        game_data.league.append(row[7])
    if return_urls:
        span = soup.find('span', {'class': 'next-games-date'})
        a_tags = span.findAll('a')
        urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags]
        return game_data, urls
    return game_data
if __name__ == '__main__':
    results = None
    pool = ThreadPool(5)  # We will be getting, however, 7 URLs
    # Get today's data and the URLs for the other days:
    game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
    urls.pop(1)  # Remove URL for today: we already have the data for that
    game_data_results = pool.imap(parse_data, urls)
    for i in range(8):
        game_data = game_data_today if i == 1 else next(game_data_results)
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            results = results.append(result, ignore_index=True)
    print(results)
    # print(results.head())

    # ensure all the drivers are "quitted":
    del threadLocal
    import gc
    gc.collect()  # a little extra insurance
Prints:
date time game score home_odds draw_odds away_odds country league
0 07 Sep 2021 00:00 Pachuca W - Monterrey W 0:1 +219 +280 -106 Mexico Liga MX Women
1 07 Sep 2021 01:05 Millonarios - Patriotas 1:0 -303 +380 +807 Colombia Primera A
2 07 Sep 2021 02:00 Club Tijuana W - Club Leon W 4:0 -149 +293 +311 Mexico Liga MX Women
3 07 Sep 2021 08:30 Suzhou Dongwu - Nanjing City 0:0 +165 +190 +177 China Jia League
4 07 Sep 2021 08:45 Kuching City FC - Sarawak Utd. 1:0 +309 +271 -143 Malaysia Premier League
... ... ... ... ... ... ... ... ... ...
1305 14 Sep 2021 21:45 Central Cordoba - Atl. Tucuman +192 +217 +146 13 Argentina Liga Profesional
1306 14 Sep 2021 22:00 Colo Colo - Everton -141 +249 +395 11 Chile Primera Division
1307 14 Sep 2021 23:30 Columbus Crew - New York Red Bulls - - - 1 USA MLS
1308 14 Sep 2021 23:30 New York City - FC Dallas - - - 1 USA MLS
1309 14 Sep 2021 23:30 Toronto FC - Inter Miami - - - 1 USA MLS
[1310 rows x 9 columns]
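A note in passing: DataFrame.append, used above to accumulate the per-day frames, was deprecated in pandas 1.4 and removed in pandas 2.0. On a newer pandas you can instead collect the frames in a list and concatenate once at the end. A minimal sketch of just that loop, assuming the same game_data_today and game_data_results as above (frames is my own name):

frames = []
for i in range(8):
    game_data = game_data_today if i == 1 else next(game_data_results)
    frames.append(pd.DataFrame(game_data.__dict__))  # plain list append, not DataFrame.append
results = pd.concat(frames, ignore_index=True)  # a single concat is also faster than repeated appends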
Solution 2:
I'd suggest you integrate this method when iterating over URLs.
Code snippet:
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()  # assumes a configured Chrome driver

# assuming you have a list of start_urls
start_urls = ['https://www.oddsportal.com/matches/soccer/20210903/']
urls = []

# get links for yesterday, today, tomorrow and the next 5 days
for start_url in start_urls:
    driver.get(start_url)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'lxml')
    dates = soup.find('span', class_='next-games-date')
    links = dates.find_all('a')
    for link in links:
        urls.append('https://www.oddsportal.com' + link['href'])

# get data from each link
for url in urls:
    driver.get(url)
    # function call to parse data
    # function call to append data
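One way to fill in those placeholder comments is to reuse parse_data from Solution 1, which drives its own browser, so the final loop would then not need the driver.get call at all. A minimal sketch under that assumption (frames and the final pd.concat are my additions, not part of the original snippet):

import pandas as pd

frames = []
for url in urls:
    game_data = parse_data(url)                      # function call to parse data
    frames.append(pd.DataFrame(game_data.__dict__))  # function call to append data
results = pd.concat(frames, ignore_index=True)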