Code to help with the re-arranging of my life in 2024.
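"""Scrape Rightmove property listings.

Extracts listing data from the PAGE_MODEL variable embedded in listing pages,
resolves location IDs through the typeahead API, and pages through the search
API (which caps results at 1,000 properties), dumping the raw results to JSON.
"""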
import asyncio
import json
import datetime
import os
from typing import List, Optional, TypedDict
from urllib.parse import urlencode
import jmespath
from httpx import AsyncClient, Response
from parsel import Selector
class PropertyResult(TypedDict):
"""this is what our result dataset will look like"""
id: str
available: bool
archived: bool
phone: str
bedrooms: int
bathrooms: int
type: str
property_type: str
tags: list
description: str
title: str
subtitle: str
price: str
price_sqft: str
address: dict
latitude: float
longitude: float
features: list
history: dict
photos: list
floorplans: list
agency: dict
industryAffiliations: list
nearest_airports: list
nearest_stations: list
sizings: list
brochures: list
# 1. establish HTTP client with browser-like headers to avoid being blocked
client = AsyncClient(
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
    },
    follow_redirects=True,
    http2=True,  # enable http2 to reduce block chance
    timeout=30,
)
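# note: http2=True requires httpx's optional HTTP/2 extra (pip install "httpx[http2]")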
# 2. parse the cached property data into our result shape using JMESPath queries
def parse_property(data) -> PropertyResult:
    """parse rightmove cache data for property information"""
    # here we define the field name to JMESPath mapping
    parse_map = {
"id": "id",
"available": "status.published",
"archived": "status.archived",
"phone": "contactInfo.telephoneNumbers.localNumber",
"bedrooms": "bedrooms",
"bathrooms": "bathrooms",
"type": "transactionType",
"property_type": "propertySubType",
"tags": "tags",
"description": "text.description",
"title": "text.pageTitle",
"subtitle": "text.propertyPhrase",
"price": "prices.primaryPrice",
"price_sqft": "prices.pricePerSqFt",
"address": "address",
"latitude": "location.latitude",
"longitude": "location.longitude",
"features": "keyFeatures",
"history": "listingHistory",
"photos": "images[*].{url: url, caption: caption}",
"floorplans": "floorplans[*].{url: url, caption: caption}",
"agency": """customer.{
id: branchId,
branch: branchName,
company: companyName,
address: displayAddress,
commercial: commercial,
buildToRent: buildToRent,
isNew: isNewHomeDeveloper
}""",
"industryAffiliations": "industryAffiliations[*].name",
"nearest_airports": "nearestAirports[*].{name: name, distance: distance}",
"nearest_stations": "nearestStations[*].{name: name, distance: distance}",
"sizings": "sizings[*].{unit: unit, min: minimumSize, max: maximumSize}",
"brochures": "brochures",
}
    results = {}
    for key, path in parse_map.items():
        value = jmespath.search(path, data)
        results[key] = value
    return results
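# e.g. jmespath.search("prices.primaryPrice", {"prices": {"primaryPrice": "£450,000"}})
# returns "£450,000" (the value here is illustrative)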
# This function will find the PAGE_MODEL javascript variable and extract it
def extract_property(response: Response) -> Optional[dict]:
    """extract property data from rightmove PAGE_MODEL javascript variable"""
    selector = Selector(response.text)
    data = selector.xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
    if not data:
        print(f"page {response.url} is not a property listing page")
        return
    data = data.split("PAGE_MODEL = ", 1)[1].strip()
    data = json.loads(data)
    return data["propertyData"]
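# The listing page embeds a script containing `PAGE_MODEL = {...}`; the JSON's
# "propertyData" key holds the structured listing data that parse_property() reads.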
# this is our main scraping function that takes urls and returns the data
async def scrape_properties(urls: List[str]) -> List[dict]:
"""Scrape Rightmove property listings for property data"""
to_scrape = [client.get(url) for url in urls]
properties = []
for response in asyncio.as_completed(to_scrape):
response = await response
properties.append(parse_property(extract_property(response)))
return properties
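# Example usage (the listing URL is illustrative, not a real property):
#   listings = await scrape_properties(["https://www.rightmove.co.uk/properties/123456789#/"])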
async def find_locations(query: str) -> List[str]:
"""use rightmove's typeahead api to find location IDs. Returns list of
location IDs in most likely order"""
# rightmove uses two character long tokens so "cornwall" becomes "CO/RN/WA/LL"
tokenize_query = "".join(c + ("/" if i % 2 == 0 else "") for i, c in enumerate(query.upper(), start=1))
url = f"https://www.rightmove.co.uk/typeAhead/uknostreet/{tokenize_query.strip('/')}/"
response = await client.get(url)
data = json.loads(response.text)
return [prediction["locationIdentifier"] for prediction in data["typeAheadLocations"]]
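# Example usage:
#   location_ids = await find_locations("cornwall")  # e.g. ["REGION^61294", ...] (values illustrative)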
async def scrape_search(location_id: str) -> List[dict]:
    """scrape all search results for a given location ID (the API caps results at 1000)"""
    RESULTS_PER_PAGE = 24
    def make_url(offset: int) -> str:
        url = "https://www.rightmove.co.uk/api/_search?"
        params = {
            "areaSizeUnit": "sqft",
            "channel": "RENT",  # BUY or RENT
            "currencyCode": "GBP",
            "includeSSTC": "false",
            "index": offset,  # page offset
            "isFetching": "false",
            "locationIdentifier": location_id,  # e.g.: "REGION^61294"
            "numberOfPropertiesPerPage": RESULTS_PER_PAGE,
            "radius": "0.0",
            "sortType": "6",
            "viewType": "LIST",
        }
        return url + urlencode(params)
    first_page = await client.get(make_url(0))
    first_page_data = json.loads(first_page.content)
    total_results = int(first_page_data['resultCount'].replace(',', ''))
    results = first_page_data['properties']
    other_pages = []
    # rightmove sets the API limit to 1000 properties
    max_api_results = 1000
    for offset in range(RESULTS_PER_PAGE, total_results, RESULTS_PER_PAGE):
        # stop requesting more pages when the scraper reaches the API limit
        if offset >= max_api_results:
            break
        other_pages.append(client.get(make_url(offset)))
    for response in asyncio.as_completed(other_pages):
        response = await response
        data = json.loads(response.text)
        results.extend(data['properties'])
    return results
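# Example usage (identifier as returned by find_locations; value shown is illustrative):
#   rentals = await scrape_search("REGION^61294")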
async def run():
    # Change location to search other areas of the country.
    location = 'manchester'
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    # make sure the output directory exists before writing the results file
    os.makedirs('raw-data', exist_ok=True)
    fname = f'raw-data/{timestamp}_right-move-{location}.json'
    location_id = (await find_locations(location))[0]
    print(location_id)
    location_results = await scrape_search(location_id)
    with open(fname, 'w', encoding='utf-8') as f:
        json.dump(location_results, f, ensure_ascii=False, indent=2)
if __name__ == "__main__":
    asyncio.run(run())