import asyncio
import datetime
import json
import os
from typing import List, TypedDict
from urllib.parse import urlencode

import jmespath
from httpx import AsyncClient, Response
from parsel import Selector


class PropertyResult(TypedDict):
    """this is what our result dataset will look like"""

    id: str
    available: bool
    archived: bool
    phone: str
    bedrooms: int
    bathrooms: int
    type: str
    property_type: str
    tags: list
    description: str
    title: str
    subtitle: str
    price: str
    price_sqft: str
    address: dict
    latitude: float
    longitude: float
    features: list
    history: dict
    photos: list
    floorplans: list
    agency: dict
    industryAffiliations: list
    nearest_airports: list
    nearest_stations: list
    sizings: list
    brochures: list


# 1. establish HTTP client with browser-like headers to avoid being blocked
client = AsyncClient(
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
    },
    follow_redirects=True,
    http2=True,  # enable http2 to reduce block chance
    timeout=30,
)


def parse_property(data) -> PropertyResult:
    """parse rightmove cache data for property information"""
    # here we define field name to JMESPath mapping
    parse_map = {
        "id": "id",
        "available": "status.published",
        "archived": "status.archived",
        "phone": "contactInfo.telephoneNumbers.localNumber",
        "bedrooms": "bedrooms",
        "bathrooms": "bathrooms",
        "type": "transactionType",
        "property_type": "propertySubType",
        "tags": "tags",
        "description": "text.description",
        "title": "text.pageTitle",
        "subtitle": "text.propertyPhrase",
        "price": "prices.primaryPrice",
        "price_sqft": "prices.pricePerSqFt",
        "address": "address",
        "latitude": "location.latitude",
        "longitude": "location.longitude",
        "features": "keyFeatures",
        "history": "listingHistory",
        "photos": "images[*].{url: url, caption: caption}",
        "floorplans": "floorplans[*].{url: url, caption: caption}",
        "agency": """customer.{
            id: branchId,
            branch: branchName,
            company: companyName,
            address: displayAddress,
            commercial: commercial,
            buildToRent: buildToRent,
            isNew: isNewHomeDeveloper
        }""",
        "industryAffiliations": "industryAffiliations[*].name",
        "nearest_airports": "nearestAirports[*].{name: name, distance: distance}",
        "nearest_stations": "nearestStations[*].{name: name, distance: distance}",
        "sizings": "sizings[*].{unit: unit, min: minimumSize, max: maximumSize}",
        "brochures": "brochures",
    }
    results = {}
    for key, path in parse_map.items():
        value = jmespath.search(path, data)
        results[key] = value
    return results


# This function will find the PAGE_MODEL javascript variable and extract it
def extract_property(response: Response) -> dict:
    """extract property data from rightmove PAGE_MODEL javascript variable"""
    selector = Selector(response.text)
    data = selector.xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
    if not data:
        print(f"page {response.url} is not a property listing page")
        return
    data = data.split("PAGE_MODEL = ", 1)[1].strip()
    data = json.loads(data)
    return data["propertyData"]


# this is our main scraping function that takes urls and returns the data
async def scrape_properties(urls: List[str]) -> List[dict]:
    """Scrape Rightmove property listings for property data"""
    to_scrape = [client.get(url) for url in urls]
    properties = []
    for response in asyncio.as_completed(to_scrape):
        response = await response
        data = extract_property(response)
        if data:  # skip pages that turned out not to be property listings
            properties.append(parse_property(data))
    return properties
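
# A quick illustration of how the JMESPath expressions in parse_map behave.
# The sample dict below is a hand-written assumption for demonstration
# purposes, not real Rightmove PAGE_MODEL output:
def demo_jmespath() -> None:
    sample = {
        "prices": {"primaryPrice": "£1,200 pcm"},
        "images": [
            {"url": "https://example.com/1.jpg", "caption": "Front", "size": 1},
        ],
    }
    # dotted paths drill into nested objects
    print(jmespath.search("prices.primaryPrice", sample))  # "£1,200 pcm"
    # [*] projects over a list and {...} keeps only the named keys
    print(jmespath.search("images[*].{url: url, caption: caption}", sample))
    # -> [{"url": "https://example.com/1.jpg", "caption": "Front"}]
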
async def find_locations(query: str) -> List[str]:
    """use rightmove's typeahead api to find location IDs.
    Returns a list of location IDs in most-likely order"""
    # rightmove uses two character long tokens, so "cornwall" becomes "CO/RN/WA/LL"
    tokenize_query = "".join(
        c + ("/" if i % 2 == 0 else "") for i, c in enumerate(query.upper(), start=1)
    )
    url = f"https://www.rightmove.co.uk/typeAhead/uknostreet/{tokenize_query.strip('/')}/"
    response = await client.get(url)
    data = json.loads(response.text)
    return [prediction["locationIdentifier"] for prediction in data["typeAheadLocations"]]


async def scrape_search(location_id: str) -> List[dict]:
    RESULTS_PER_PAGE = 24

    def make_url(offset: int) -> str:
        url = "https://www.rightmove.co.uk/api/_search?"
        params = {
            "areaSizeUnit": "sqft",
            "channel": "RENT",  # BUY or RENT
            "currencyCode": "GBP",
            "includeSSTC": "false",
            "index": offset,  # page offset
            "isFetching": "false",
            "locationIdentifier": location_id,  # e.g. "REGION^61294"
            "numberOfPropertiesPerPage": RESULTS_PER_PAGE,
            "radius": "0.0",
            "sortType": "6",
            "viewType": "LIST",
        }
        return url + urlencode(params)

    first_page = await client.get(make_url(0))
    first_page_data = json.loads(first_page.content)
    total_results = int(first_page_data["resultCount"].replace(",", ""))
    results = first_page_data["properties"]

    other_pages = []
    # rightmove caps the search API at 1000 properties
    max_api_results = 1000
    for offset in range(RESULTS_PER_PAGE, total_results, RESULTS_PER_PAGE):
        # stop scheduling more pages once the scraper reaches the API limit
        if offset >= max_api_results:
            break
        other_pages.append(client.get(make_url(offset)))
    for response in asyncio.as_completed(other_pages):
        response = await response
        data = json.loads(response.text)
        results.extend(data["properties"])
    return results


async def run():
    # Change location to search other areas of the country.
    location = "manchester"
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    fname = f"raw-data/{timestamp}_right-move-{location}.json"
    os.makedirs("raw-data", exist_ok=True)  # make sure the output directory exists
    location_id = (await find_locations(location))[0]
    print(location_id)
    location_results = await scrape_search(location_id)
    with open(fname, "w", encoding="utf-8") as f:
        json.dump(location_results, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    asyncio.run(run())
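
# Sanity check for the typeahead tokenization used in find_locations above.
# An illustrative sketch only; the expected value comes from the
# "cornwall" -> "CO/RN/WA/LL" example noted in that function:
def demo_tokenize() -> str:
    query = "cornwall"
    tokenized = "".join(
        c + ("/" if i % 2 == 0 else "") for i, c in enumerate(query.upper(), start=1)
    ).strip("/")
    assert tokenized == "CO/RN/WA/LL"  # two-character tokens joined by "/"
    return tokenized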