Scraping the Pokémon database
I wanted to retrieve information from the Pokémon database. Here is a small Python script that collects the basic information about each Pokémon from the main page, then follows each Pokémon's link to retrieve more advanced statistics.
"""get all pokemons' names & infos from the web page and all its links,
then save results in a csv file for later analysis with a jupyter notebook"""
import os, requests, pickle, html5lib
from bs4 import BeautifulSoup
# Path of the on-disk cache: scrape_general_page pickles the HTTP response of
# the main page into this file and reads it back on later runs.
file_name = 'scraped_pokemon_page.pickle'
# all parsed infos = columns' names of the csv file
def scrape_general_page(base_url):
    """Fetch the national Pokedex page, caching the response on disk.

    When the cache file ``file_name`` does not exist yet, the page at
    ``base_url + "/pokedex/national"`` is downloaded and the response object
    is pickled to that file. The response is then always read back from the
    cache, so later runs never hit the network.

    Args:
        base_url: Root URL of the site; "/pokedex/national" is appended to it.

    Returns:
        The ``requests`` response object restored from the pickle cache.

    Raises:
        requests.HTTPError: If the download does not answer with status 200.
            Nothing is cached in that case.
    """
    if not os.path.exists(file_name):
        result = requests.get(base_url + "/pokedex/national")
        # Raise explicitly instead of asserting: an assert is stripped under
        # ``python -O`` and would let a failed response be cached.
        if result.status_code != 200:
            raise requests.HTTPError(
                f'Attempt to retrieve web page failed - result code {result.status_code}',
                response=result,
            )
        with open(file_name, 'wb') as f:
            pickle.dump(result, f)
    # NOTE(security): unpickling can execute arbitrary code. This is only
    # acceptable because the cache file is written by this very script; never
    # point ``file_name`` at a file obtained from somewhere else.
    with open(file_name, 'rb') as f:
        return pickle.load(f)
def create_bs4_object(result):
    """Turn an HTTP response into a BeautifulSoup tree ready for parsing.

    Args:
        result: Response object whose ``content`` attribute holds the page
            markup.

    Returns:
        A ``BeautifulSoup`` object built from ``result.content`` with the
        'html5lib' parser.
    """
    page_markup = result.content
    parsed_page = BeautifulSoup(page_markup, features='html5lib')
    return parsed_page
def get_gen_infos(ic):
    """Extract the general information held by one infocard.

    The first ``<a>`` of the card carries the Pokemon's name and the link to
    its own page; the following anchors (at most two are used) are its types.

    Args:
        ic: The infocard element. It must expose ``small.get_text()`` and
            ``find_all('a')``.

    Returns:
        A tuple ``(id_nb, name, type_1, type_2, link)``. A missing type is
        reported as the string 'Nan'.

    Raises:
        IndexError: If the card contains no ``<a>`` element at all.
    """
    id_nb = ic.small.get_text()

    # Query the anchors once instead of re-running find_all for every field.
    anchors = ic.find_all('a')
    name_anchor = anchors[0]
    name = name_anchor.get_text()
    link = name_anchor['href']

    # Anchors after the name are the types. Pad to exactly two values so that
    # type_1 / type_2 are always bound, whatever the number of anchors.
    types = [anchor.get_text() for anchor in anchors[1:3]]
    types += ['Nan'] * (2 - len(types))
    type_1, type_2 = types

    return id_nb, name, type_1, type_2, link
def get_stats(soup):
    """Read the detailed statistics from a single Pokemon's page.

    Each value is located through the ``<th>`` cell whose text is the label
    (e.g. "Species", "HP") and read from the element two siblings further.

    Args:
        soup: Parsed page of one Pokemon.

    Returns:
        A 15-item tuple, in this order: species, height, weight, first listed
        ability, catch rate, base exp, growth rate, gender, HP, attack,
        defense, sp. atk, sp. def, speed, total.
    """
    def value_cell(label):
        # The value is not the immediate sibling of the header cell but the
        # one after it (presumably a whitespace text node sits in between).
        header_cell = soup.find("th", text=label)
        return header_cell.next_sibling.next_sibling

    # Fields that need a specific accessor or only the first token of the cell.
    species = value_cell("Species").string
    height = value_cell("Height").string.split()[0]
    weight = value_cell("Weight").string.split()[0]
    first_ability = value_cell("Abilities").a.text
    catch_rate = value_cell("Catch rate").text.split()[0]

    # Remaining fields are taken as the raw text of their cell.
    raw_text_labels = (
        "Base Exp.", "Growth Rate", "Gender",
        "HP", "Attack", "Defense", "Sp. Atk", "Sp. Def", "Speed", "Total",
    )
    raw_text_values = tuple(value_cell(label).text for label in raw_text_labels)

    return (species, height, weight, first_ability, catch_rate) + raw_text_values
def parse_infocards(soup, pokemon_infos):
    """Parse every infocard of the main page and write the database as CSV.

    For each infocard the general information is taken from the card itself
    (``get_gen_infos``), then the Pokemon's own page is requested at
    ``BASE_URL + link`` and its detailed statistics are parsed
    (``get_stats``). When that request does not answer with status 200, the
    statistics columns of the row are filled with 'Nan' instead.

    The result is written to 'pokemon_db.csv' with ';' as separator: one
    header line followed by one line per Pokemon.

    Args:
        soup: Parsed main page containing the infocards.
        pokemon_infos: The column names as a single ';'-separated string.
            Whitespace and line breaks around the names are ignored.

    Returns:
        None. The output is the CSV file.
    """
    # Work with the list of column names: the length of the raw string is a
    # character count and cannot be used to pad a row.
    columns = [col.strip() for col in pokemon_infos.split(';') if col.strip()]

    # get a list of all infocards
    infocards = soup.find_all("span", class_="infocard-lg-data text-muted")

    # Explicit UTF-8: the scraped text is not guaranteed to be ASCII, and the
    # platform default encoding may be unable to represent it.
    with open('pokemon_db.csv', 'w', encoding='utf-8') as f:
        # write headers / columns' names
        f.write(";".join(columns))

        # write infos line by line
        for ic in infocards:
            gen_infos_list = list(get_gen_infos(ic))

            # request for linked page to get specific stats
            r = requests.get(BASE_URL + gen_infos_list[-1])
            if r.status_code == 200:
                other_soup = BeautifulSoup(r.content, 'html5lib')
                stats = list(get_stats(other_soup))
            else:
                # Do not try to parse a failed response: keep the row aligned
                # with the header by filling the remaining columns.
                stats = ['Nan'] * (len(columns) - len(gen_infos_list))

            f.write("\n" + ";".join(gen_infos_list + stats))
def main():
    """Scrape the main page, then write every Pokemon's data to the CSV file.

    Loads (or downloads) the main page through ``scrape_general_page``, parses
    it, and hands the parsed page plus the CSV column names to
    ``parse_infocards``.
    """
    result = scrape_general_page(BASE_URL)
    soup = create_bs4_object(result)

    # Column names of the CSV file. The order must match the values returned
    # by get_gen_infos (first five) followed by those returned by get_stats
    # (remaining fifteen). Joining a tuple keeps the header free of the line
    # breaks and stray spaces a multi-line string literal would embed.
    pokemon_infos = ";".join((
        "id_nb",
        "name",
        "type_1",
        "type_2",
        "link",
        "data_species",
        "data_height",
        "data_weight",
        "data_abilities",
        "training_catch_rate",
        "training_base_exp",
        "training_growth_rate",
        "breeding_gender",
        "stats_hp",
        "stats_attack",
        "stats_defense",
        "stats_sp_atk",
        "stats_sp_def",
        "stats_speed",
        "stats_total",
    ))

    parse_infocards(soup, pokemon_infos)
if __name__ == "__main__":