From a968caab5d7ada6a25ea8ad1cec012fc4ddd9acf Mon Sep 17 00:00:00 2001
From: Craig Oates
Date: Mon, 19 Feb 2024 00:46:14 +0000
Subject: [PATCH] write rightmove.py script for scraping data from the Rightmove site.

---
 rightmove.py | 198 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 198 insertions(+)
 create mode 100644 rightmove.py

diff --git a/rightmove.py b/rightmove.py
new file mode 100644
index 0000000..635f0c1
--- /dev/null
+++ b/rightmove.py
@@ -0,0 +1,198 @@
+import asyncio
+import datetime
+import json
+import os
+from typing import List, Optional, TypedDict
+from urllib.parse import urlencode
+
+import jmespath
+from httpx import AsyncClient, Response
+from parsel import Selector
+
+class PropertyResult(TypedDict):
+    """this is what our result dataset will look like"""
+    id: str
+    available: bool
+    archived: bool
+    phone: str
+    bedrooms: int
+    bathrooms: int
+    type: str
+    property_type: str
+    tags: list
+    description: str
+    title: str
+    subtitle: str
+    price: str
+    price_sqft: str
+    address: dict
+    latitude: float
+    longitude: float
+    features: list
+    history: dict
+    photos: list
+    floorplans: list
+    agency: dict
+    industryAffiliations: list
+    nearest_airports: list
+    nearest_stations: list
+    sizings: list
+    brochures: list
+
+# 1. establish HTTP client with browser-like headers to avoid being blocked
+client = AsyncClient(
+    headers={
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+        "Accept-Encoding": "gzip, deflate, br",
+        "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
+    },
+    follow_redirects=True,
+    http2=True,  # enable http2 to reduce block chance
+    timeout=30,
+)
+
+# 2. parse the PAGE_MODEL cache data into a flat result record
+def parse_property(data) -> PropertyResult:
+    """parse rightmove cache data for property information"""
+    # here we define field name to JMESPath mapping
+    parse_map = {
+        "id": "id",
+        "available": "status.published",
+        "archived": "status.archived",
+        "phone": "contactInfo.telephoneNumbers.localNumber",
+        "bedrooms": "bedrooms",
+        "bathrooms": "bathrooms",
+        "type": "transactionType",
+        "property_type": "propertySubType",
+        "tags": "tags",
+        "description": "text.description",
+        "title": "text.pageTitle",
+        "subtitle": "text.propertyPhrase",
+        "price": "prices.primaryPrice",
+        "price_sqft": "prices.pricePerSqFt",
+        "address": "address",
+        "latitude": "location.latitude",
+        "longitude": "location.longitude",
+        "features": "keyFeatures",
+        "history": "listingHistory",
+        "photos": "images[*].{url: url, caption: caption}",
+        "floorplans": "floorplans[*].{url: url, caption: caption}",
+        "agency": """customer.{
+            id: branchId,
+            branch: branchName,
+            company: companyName,
+            address: displayAddress,
+            commercial: commercial,
+            buildToRent: buildToRent,
+            isNew: isNewHomeDeveloper
+        }""",
+        "industryAffiliations": "industryAffiliations[*].name",
+        "nearest_airports": "nearestAirports[*].{name: name, distance: distance}",
+        "nearest_stations": "nearestStations[*].{name: name, distance: distance}",
+        "sizings": "sizings[*].{unit: unit, min: minimumSize, max: maximumSize}",
+        "brochures": "brochures",
+    }
+    results = {}
+    for key, path in parse_map.items():
+        value = jmespath.search(path, data)
+        results[key] = value
+    return results
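+
+# For illustration, with made-up data rather than a real listing: each JMESPath
+# expression above pulls one field out of the nested cache object, e.g.
+#   jmespath.search("prices.primaryPrice", {"prices": {"primaryPrice": "£1,500 pcm"}})
+# returns "£1,500 pcm"; parse_property applies the whole mapping to flatten the
+# nested cache data into the PropertyResult shape.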
PAGE_MODEL javascript variable""" + selector = Selector(response.text) + data = selector.xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get() + if not data: + print(f"page {response.url} is not a property listing page") + return + data = data.split("PAGE_MODEL = ", 1)[1].strip() + data = json.loads(data) + return data["propertyData"] + + +# this is our main scraping function that takes urls and returns the data +async def scrape_properties(urls: List[str]) -> List[dict]: + """Scrape Rightmove property listings for property data""" + to_scrape = [client.get(url) for url in urls] + properties = [] + for response in asyncio.as_completed(to_scrape): + response = await response + properties.append(parse_property(extract_property(response))) + return properties + +async def find_locations(query: str) -> List[str]: + """use rightmove's typeahead api to find location IDs. Returns list of + location IDs in most likely order""" + # rightmove uses two character long tokens so "cornwall" becomes "CO/RN/WA/LL" + tokenize_query = "".join(c + ("/" if i % 2 == 0 else "") for i, c in enumerate(query.upper(), start=1)) + url = f"https://www.rightmove.co.uk/typeAhead/uknostreet/{tokenize_query.strip('/')}/" + response = await client.get(url) + data = json.loads(response.text) + return [prediction["locationIdentifier"] for prediction in data["typeAheadLocations"]] + + +async def scrape_search(location_id: str) -> dict: + RESULTS_PER_PAGE = 24 + + def make_url(offset: int) -> str: + url = "https://www.rightmove.co.uk/api/_search?" + params = { + "areaSizeUnit": "sqft", + "channel": "RENT", # BUY or RENT + "currencyCode": "GBP", + "includeSSTC": "false", + "index": offset, # page offset + "isFetching": "false", + "locationIdentifier": location_id, #e.g.: "REGION^61294", + "numberOfPropertiesPerPage": RESULTS_PER_PAGE, + "radius": "0.0", + "sortType": "6", + "viewType": "LIST", + } + return url + urlencode(params) + first_page = await client.get(make_url(0)) + first_page_data = json.loads(first_page.content) + total_results = int(first_page_data['resultCount'].replace(',', '')) + results = first_page_data['properties'] + + other_pages = [] + # rightmove sets the API limit to 1000 properties + max_api_results = 1000 + for offset in range(RESULTS_PER_PAGE, total_results, RESULTS_PER_PAGE): + # stop scraping more pages when the scraper reach the API limit + if offset >= max_api_results: + break + other_pages.append(client.get(make_url(offset))) + for response in asyncio.as_completed(other_pages): + response = await response + data = json.loads(response.text) + results.extend(data['properties']) + return results + +async def run(): + # Change location to search other areas of the country. + location = 'manchester' + timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + fname = f'raw-data/{timestamp}_right-move-{location}.json' + location_id = (await find_locations(location))[0] + print(location_id) + location_results = await scrape_search(location_id) + with open(fname, 'w', encoding='utf-8') as f: + json.dump(location_results, f, ensure_ascii=False, indent=2) + +if __name__ == "__main__": + asyncio.run(run())