The request might be blocked if using requests
as default user-agent
in requests
library is a python-requests
.
An additional step could be to rotate user-agent
, for example, to switch between PC, mobile, and tablet, as well as between browsers e.g. Chrome, Firefox, Safari, Edge and so on. User-agent rotation can be used in combo with proxy rotation (ideally residential) + CAPTCHA solver.
At the moment, the Google Play Store has been heavily redesigned, now it is almost completely dynamic. However, all the data can be extracted from the inline JSON.
For scraping dynamic sites, selenium
or playwright
webdriver is great. However, in our case, using BeautifulSoup and regular expression is faster to extract data from the page source.
We must extract certain <script>
element from all <script>
elements in the HTML, by using regular expression
, and transform in to a dict
with json.loads()
:
# https://regex101.com/r/zOMOfo/1
basic_app_info = json.loads(re.findall(r"<script nonce=\"\w+\" type=\"application/ld\+json\">({.*?)</script>", str(soup.select("script")[11]), re.DOTALL)[0])
Check code in online IDE.
from bs4 import BeautifulSoup
import requests, re, json, lxml
# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
}
# https://requests.readthedocs.io/en/latest/user/quickstart/#passing-parameters-in-urls
params = {
"id": "com.nintendo.zara", # app name
"gl": "US", # country of the search
"hl": "en_GB" # language of the search
}
html = requests.get("https://play.google.com/store/apps/details", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")
# where all app data will be stored
app_data = {
"basic_info":{
"developer":{},
"downloads_info": {}
}
}
# [11] index is a basic app information
# https://regex101.com/r/zOMOfo/1
basic_app_info = json.loads(re.findall(r"<script nonce=\"\w+\" type=\"application/ld\+json\">({.*?)</script>", str(soup.select("script")[11]), re.DOTALL)[0])
# https://regex101.com/r/6Reb0M/1
additional_basic_info = re.search(fr"<script nonce=\"\w+\">AF_initDataCallback\(.*?(\"{basic_app_info.get('name')}\".*?)\);<\/script>",
str(soup.select("script")), re.M|re.DOTALL).group(1)
app_data["basic_info"]["name"] = basic_app_info.get("name")
app_data["basic_info"]["type"] = basic_app_info.get("@type")
app_data["basic_info"]["url"] = basic_app_info.get("url")
app_data["basic_info"]["description"] = basic_app_info.get("description").replace("\n", "") # replace new line character to nothing
app_data["basic_info"]["application_category"] = basic_app_info.get("applicationCategory")
app_data["basic_info"]["operating_system"] = basic_app_info.get("operatingSystem")
app_data["basic_info"]["thumbnail"] = basic_app_info.get("image")
app_data["basic_info"]["content_rating"] = basic_app_info.get("contentRating")
app_data["basic_info"]["rating"] = round(float(basic_app_info.get("aggregateRating").get("ratingValue")), 1) # 4.287856 -> 4.3
app_data["basic_info"]["reviews"] = basic_app_info.get("aggregateRating").get("ratingCount")
app_data["basic_info"]["reviews"] = basic_app_info.get("aggregateRating").get("ratingCount")
app_data["basic_info"]["price"] = basic_app_info["offers"][0]["price"]
app_data["basic_info"]["developer"]["name"] = basic_app_info.get("author").get("name")
app_data["basic_info"]["developer"]["url"] = basic_app_info.get("author").get("url")
# https://regex101.com/r/C1WnuO/1
app_data["basic_info"]["developer"]["email"] = re.search(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", additional_basic_info).group(0)
# https://regex101.com/r/Y2mWEX/1 (a few matches but re.search always matches the first occurence)
app_data["basic_info"]["release_date"] = re.search(r"\d{1,2}\s[A-Z-a-z]{3}\s\d{4}", additional_basic_info).group(0)
# https://regex101.com/r/7yxDJM/1
app_data["basic_info"]["downloads_info"]["long_form_not_formatted"] = re.search(r"\"(\d+,?\d+,?\d+\+)\"\,(\d+),(\d+),\"(\d+M\+)\"", additional_basic_info).group(1)
app_data["basic_info"]["downloads_info"]["long_form_formatted"] = re.search(r"\"(\d+,?\d+,?\d+\+)\"\,(\d+),(\d+),\"(\d+M\+)\"", additional_basic_info).group(2)
app_data["basic_info"]["downloads_info"]["as_displayed_short_form"] = re.search(r"\"(\d+,?\d+,?\d+\+)\"\,(\d+),(\d+),\"(\d+M\+)\"", additional_basic_info).group(4)
app_data["basic_info"]["downloads_info"]["actual_downloads"] = re.search(r"\"(\d+,?\d+,?\d+\+)\"\,(\d+),(\d+),\"(\d+M\+)\"", additional_basic_info).group(3)
# https://regex101.com/r/jjsdUP/1
# [2:] skips 2 PEGI logo thumbnails and extracts only app images
app_data["basic_info"]["images"] = re.findall(r",\[\d{3,4},\d{3,4}\],.*?(https.*?)\"", additional_basic_info)[2:]
try:
# https://regex101.com/r/C1WnuO/1
app_data["basic_info"]["video_trailer"] = "".join(re.findall(r"\"(https:\/\/play-games\.\w+\.com\/vp\/mp4\/\d+x\d+\/\S+\.mp4)\"", additional_basic_info)[0])
except:
app_data["basic_info"]["video_trailer"] = None
print(json.dumps(app_data, indent=2, ensure_ascii=False))
Example output:
[
{
"basic_info": {
"developer": {
"name": "Nintendo Co., Ltd.",
"url": "https://supermariorun.com/",
"email": "[email protected]"
},
"downloads_info": {
"long_form_not_formatted": "100,000,000+",
"long_form_formatted": "100000000",
"as_displayed_short_form": "100M+",
"actual_downloads": "213064462"
},
"name": "Super Mario Run",
"type": "SoftwareApplication",
"url": "https://play.google.com/store/apps/details/Super_Mario_Run?id=com.nintendo.zara&hl=en_GB&gl=US",
"description": "Control Mario with just a tap!",
"application_category": "GAME_ACTION",
"operating_system": "ANDROID",
"thumbnail": "https://play-lh.googleusercontent.com/3ZKfMRp_QrdN-LzsZTbXdXBH-LS1iykSg9ikNq_8T2ppc92ltNbFxS-tORxw2-6kGA",
"content_rating": "Everyone",
"rating": 4.0,
"reviews": "1645926",
"price": "0",
"release_date": "22 Mar 2017",
"images": [
"https://play-lh.googleusercontent.com/yT8ZCQHNB_MGT9Oc6mC5_mQS5vZ-5A4fvKQHHOl9NBy8yWGbM5-EFG_uISOXmypBYQ6G",
"https://play-lh.googleusercontent.com/AvRrlEpV8TCryInAnA__FcXqDu5d3i-XrUp8acW2LNmzkU-rFXkAKgmJPA_4AHbNjyY",
"https://play-lh.googleusercontent.com/AESbAa4QFa9-lVJY0vmAWyq2GXysv5VYtpPuDizOQn40jS9Z_ji8HXHA5hnOIzaf_w",
"https://play-lh.googleusercontent.com/KOCWy63UI2p7Fc65_X5gnIHsErEt7gpuKoD-KcvpGfRSHp-4k8YBGyPPopnrNQpdiQ",
"https://play-lh.googleusercontent.com/iDJagD2rKMJ92hNUi5WS2S_mQ6IrKkz6-G8c_zHNU9Ck8XMrZZP-1S_KkDsA6KDJ9No",
# ...
]
A possible good solution with shorter and simpler code could be Google Play Store API from SerpApi. It's a paid API with a free plan.
The difference is that it will bypass blocks (including CAPTCHA) from Google, no need to create the parser and maintain it.
SerpApi simple code example:
from serpapi import GoogleSearch
import os, json
params = {
"api_key": os.getenv("API_KEY"), # your serpapi api key
"engine": "google_play_product", # parsing engine
"store": "apps", # app page
"gl": "us", # country of the search
"product_id": "com.nintendo.zara", # low review count example to show it exits the while loop
"all_reviews": "true" # shows all reviews
}
search = GoogleSearch(params) # where data extraction happens
results = search.get_dict()
print(json.dumps(results["product_info"], indent=2, ensure_ascii=False))
print(json.dumps(results["media"], indent=2, ensure_ascii=False))
# other data
Output exactly the same as in the previous solution.
There's a Scrape Google Play Store App in Python blog post if you need a little bit more code explanation.
Disclaimer, I work for SerpApi.