write rightmove.py script for scraping data from the Rightmove site.

master
Craig Oates, 3 months ago
parent commit a968caab5d
rightmove.py (+183 lines)

@@ -0,0 +1,183 @@
import asyncio
import json
import datetime
import os
from typing import List, TypedDict
from urllib.parse import urlencode

import jmespath
from httpx import AsyncClient, Response
from parsel import Selector


class PropertyResult(TypedDict):
    """this is what our result dataset will look like"""

    id: str
    available: bool
    archived: bool
    phone: str
    bedrooms: int
    bathrooms: int
    type: str
    property_type: str
    tags: list
    description: str
    title: str
    subtitle: str
    price: str
    price_sqft: str
    address: dict
    latitude: float
    longitude: float
    features: list
    history: dict
    photos: list
    floorplans: list
    agency: dict
    industryAffiliations: list
    nearest_airports: list
    nearest_stations: list
    sizings: list
    brochures: list


# 1. establish HTTP client with browser-like headers to avoid being blocked
client = AsyncClient(
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
    },
    follow_redirects=True,
    http2=True,  # enable http2 to reduce block chance
    timeout=30,
)
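
# Note: http2=True needs httpx's optional HTTP/2 extra; install it with
# `pip install "httpx[http2]"`, otherwise the client raises on construction.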


def parse_property(data) -> PropertyResult:
    """parse rightmove cache data for property information"""
    # here we define field name to JMESPath mapping
    parse_map = {
        "id": "id",
        "available": "status.published",
        "archived": "status.archived",
        "phone": "contactInfo.telephoneNumbers.localNumber",
        "bedrooms": "bedrooms",
        "bathrooms": "bathrooms",
        "type": "transactionType",
        "property_type": "propertySubType",
        "tags": "tags",
        "description": "text.description",
        "title": "text.pageTitle",
        "subtitle": "text.propertyPhrase",
        "price": "prices.primaryPrice",
        "price_sqft": "prices.pricePerSqFt",
        "address": "address",
        "latitude": "location.latitude",
        "longitude": "location.longitude",
        "features": "keyFeatures",
        "history": "listingHistory",
        "photos": "images[*].{url: url, caption: caption}",
        "floorplans": "floorplans[*].{url: url, caption: caption}",
        "agency": """customer.{
            id: branchId,
            branch: branchName,
            company: companyName,
            address: displayAddress,
            commercial: commercial,
            buildToRent: buildToRent,
            isNew: isNewHomeDeveloper
        }""",
        "industryAffiliations": "industryAffiliations[*].name",
        "nearest_airports": "nearestAirports[*].{name: name, distance: distance}",
        "nearest_stations": "nearestStations[*].{name: name, distance: distance}",
        "sizings": "sizings[*].{unit: unit, min: minimumSize, max: maximumSize}",
        "brochures": "brochures",
    }
    results = {}
    for key, path in parse_map.items():
        value = jmespath.search(path, data)
        results[key] = value
    return results
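
# For reference, jmespath.search returns None for any path missing from the
# data, so absent fields simply show up as None in the result dict.
# A minimal illustration (hypothetical data, not a real API response):
#   jmespath.search("images[*].{url: url, caption: caption}",
#                   {"images": [{"url": "a.jpg", "caption": None}]})
#   -> [{"url": "a.jpg", "caption": None}]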


# This function will find the PAGE_MODEL javascript variable and extract it
def extract_property(response: Response) -> dict:
    """extract property data from rightmove PAGE_MODEL javascript variable"""
    selector = Selector(response.text)
    data = selector.xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
    if not data:
        print(f"page {response.url} is not a property listing page")
        return
    data = data.split("PAGE_MODEL = ", 1)[1].strip()
    data = json.loads(data)
    return data["propertyData"]
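
# The script tag being targeted looks roughly like this (a sketch inferred
# from the XPath and split above, not a verbatim page excerpt):
#   <script>PAGE_MODEL = {"propertyData": {...}, "metadata": {...}}</script>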


# this is our main scraping function that takes urls and returns the data
async def scrape_properties(urls: List[str]) -> List[dict]:
    """Scrape Rightmove property listings for property data"""
    to_scrape = [client.get(url) for url in urls]
    properties = []
    for response in asyncio.as_completed(to_scrape):
        response = await response
        properties.append(parse_property(extract_property(response)))
    return properties
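
# Example usage (the listing URL is hypothetical):
#   results = asyncio.run(scrape_properties(
#       ["https://www.rightmove.co.uk/properties/123456789#/"]))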


async def find_locations(query: str) -> List[str]:
    """use rightmove's typeahead api to find location IDs. Returns list of
    location IDs in most likely order"""
    # rightmove uses two character long tokens so "cornwall" becomes "CO/RN/WA/LL"
    tokenize_query = "".join(
        c + ("/" if i % 2 == 0 else "") for i, c in enumerate(query.upper(), start=1)
    )
    url = f"https://www.rightmove.co.uk/typeAhead/uknostreet/{tokenize_query.strip('/')}/"
    response = await client.get(url)
    data = json.loads(response.text)
    return [prediction["locationIdentifier"] for prediction in data["typeAheadLocations"]]
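
# Example: await find_locations("cornwall") requests
# .../typeAhead/uknostreet/CO/RN/WA/LL/ and returns identifiers like
# "REGION^61294" (the sample ID noted in scrape_search below), best match first.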


async def scrape_search(location_id: str) -> List[dict]:
    RESULTS_PER_PAGE = 24

    def make_url(offset: int) -> str:
        url = "https://www.rightmove.co.uk/api/_search?"
        params = {
            "areaSizeUnit": "sqft",
            "channel": "RENT",  # BUY or RENT
            "currencyCode": "GBP",
            "includeSSTC": "false",
            "index": offset,  # page offset
            "isFetching": "false",
            "locationIdentifier": location_id,  # e.g.: "REGION^61294"
            "numberOfPropertiesPerPage": RESULTS_PER_PAGE,
            "radius": "0.0",
            "sortType": "6",
            "viewType": "LIST",
        }
        return url + urlencode(params)

    first_page = await client.get(make_url(0))
    first_page_data = json.loads(first_page.content)
    total_results = int(first_page_data['resultCount'].replace(',', ''))
    results = first_page_data['properties']
    other_pages = []
    # rightmove sets the API limit to 1000 properties
    max_api_results = 1000
    for offset in range(RESULTS_PER_PAGE, total_results, RESULTS_PER_PAGE):
        # stop scraping more pages when the scraper reaches the API limit
        if offset >= max_api_results:
            break
        other_pages.append(client.get(make_url(offset)))
    for response in asyncio.as_completed(other_pages):
        response = await response
        data = json.loads(response.text)
        results.extend(data['properties'])
    return results
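
# Because of the 1000-result cap above, a search reporting, say, 2,500
# matches is still fetched only at offsets 0, 24, ..., 984, i.e. about
# 1000 properties across 42 requests (illustrative numbers, not API output).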


async def run():
    # Change location to search other areas of the country.
    location = 'manchester'
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    fname = f'raw-data/{timestamp}_right-move-{location}.json'
    location_id = (await find_locations(location))[0]
    print(location_id)
    location_results = await scrape_search(location_id)
    # make sure the output directory exists before writing (fname assumes a
    # raw-data/ folder next to the script)
    os.makedirs('raw-data', exist_ok=True)
    with open(fname, 'w', encoding='utf-8') as f:
        json.dump(location_results, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    asyncio.run(run())