You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
184 lines
6.7 KiB
184 lines
6.7 KiB
3 months ago
|
import asyncio
import datetime
import json
from pathlib import Path
from typing import List, Optional, TypedDict
from urllib.parse import urlencode

import jmespath
from httpx import AsyncClient, Response
from parsel import Selector
||
|
class PropertyResult(TypedDict):
    """Schema of one scraped Rightmove property record.

    Keys mirror the JMESPath extraction map in ``parse_property``; any
    field missing from the source page comes through as ``None``.
    """

    # identity and listing status
    id: str
    available: bool
    archived: bool
    phone: str
    # core attributes
    bedrooms: int
    bathrooms: int
    type: str
    property_type: str
    tags: list
    # listing text and pricing
    description: str
    title: str
    subtitle: str
    price: str
    price_sqft: str
    # location
    address: dict
    latitude: float
    longitude: float
    # media and additional details
    features: list
    history: dict
    photos: list
    floorplans: list
    agency: dict
    industryAffiliations: list
    nearest_airports: list
    nearest_stations: list
    sizings: list
    brochures: list
||
|
# 1. establish HTTP client with browser-like headers to avoid being blocked
# NOTE(review): one module-level client is shared by every coroutine below and
# is never closed explicitly — acceptable for a one-shot script.
client = AsyncClient(
    headers={
        # desktop Chrome User-Agent lowers the chance of bot detection
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
    },
    follow_redirects=True,
    http2=True,  # enable http2 to reduce block chance
    timeout=30,
)
||
|
# NOTE: requires the third-party `jmespath` package (imported at file top)
def parse_property(data: dict) -> PropertyResult:
    """Parse Rightmove cache data for property information.

    Args:
        data: the ``propertyData`` JSON object extracted from a listing page.

    Returns:
        A dict with every ``PropertyResult`` key; paths missing from the
        input yield ``None`` (``jmespath.search`` returns None on no match).
    """
    # here we define field name to JMESPath mapping
    parse_map = {
        "id": "id",
        "available": "status.published",
        "archived": "status.archived",
        "phone": "contactInfo.telephoneNumbers.localNumber",
        "bedrooms": "bedrooms",
        "bathrooms": "bathrooms",
        "type": "transactionType",
        "property_type": "propertySubType",
        "tags": "tags",
        "description": "text.description",
        "title": "text.pageTitle",
        "subtitle": "text.propertyPhrase",
        "price": "prices.primaryPrice",
        "price_sqft": "prices.pricePerSqFt",
        "address": "address",
        "latitude": "location.latitude",
        "longitude": "location.longitude",
        "features": "keyFeatures",
        "history": "listingHistory",
        "photos": "images[*].{url: url, caption: caption}",
        "floorplans": "floorplans[*].{url: url, caption: caption}",
        "agency": """customer.{
            id: branchId,
            branch: branchName,
            company: companyName,
            address: displayAddress,
            commercial: commercial,
            buildToRent: buildToRent,
            isNew: isNewHomeDeveloper
        }""",
        "industryAffiliations": "industryAffiliations[*].name",
        "nearest_airports": "nearestAirports[*].{name: name, distance: distance}",
        "nearest_stations": "nearestStations[*].{name: name, distance: distance}",
        "sizings": "sizings[*].{unit: unit, min: minimumSize, max: maximumSize}",
        "brochures": "brochures",
    }
    # run every query against the raw data; comprehension replaces the manual loop
    return {key: jmespath.search(path, data) for key, path in parse_map.items()}
||
|
# This function will find the PAGE_MODEL javascript variable and extract it
def extract_property(response: Response) -> Optional[dict]:
    """Extract property data from the Rightmove PAGE_MODEL javascript variable.

    Args:
        response: an HTTP response for a Rightmove listing URL.

    Returns:
        The ``propertyData`` object embedded in the page, or ``None`` when
        the page is not a property listing page (no PAGE_MODEL script found).
    """
    selector = Selector(response.text)
    data = selector.xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
    if not data:
        print(f"page {response.url} is not a property listing page")
        # explicit None: the annotation advertises that callers must check
        return None
    # everything after the assignment is the JSON object literal
    data = data.split("PAGE_MODEL = ", 1)[1].strip()
    data = json.loads(data)
    return data["propertyData"]
|
||
|
|
||
|
# this is our main scraping function that takes urls and returns the data
async def scrape_properties(urls: List[str]) -> List[dict]:
    """Scrape Rightmove property listings for property data.

    Args:
        urls: listing page URLs to fetch concurrently.

    Returns:
        Parsed property dicts; URLs that are not listing pages are skipped.
    """
    to_scrape = [client.get(url) for url in urls]
    properties = []
    # as_completed yields results in completion order, not input order
    for future in asyncio.as_completed(to_scrape):
        response = await future
        data = extract_property(response)
        # extract_property returns None for non-listing pages — don't parse those
        if data is not None:
            properties.append(parse_property(data))
    return properties
|
|
||
|
async def find_locations(query: str) -> List[str]:
    """Use rightmove's typeahead api to find location IDs.

    Returns a list of location IDs in most likely order.
    """
    # rightmove uses two character long tokens so "cornwall" becomes "CO/RN/WA/LL"
    upper = query.upper()
    tokenized = "/".join(upper[i:i + 2] for i in range(0, len(upper), 2))
    url = f"https://www.rightmove.co.uk/typeAhead/uknostreet/{tokenized}/"
    response = await client.get(url)
    data = json.loads(response.text)
    return [prediction["locationIdentifier"] for prediction in data["typeAheadLocations"]]
|
||
|
|
||
|
async def scrape_search(location_id: str, channel: str = "RENT") -> List[dict]:
    """Scrape every page of Rightmove search results for one location.

    Args:
        location_id: rightmove location identifier, e.g. "REGION^61294".
        channel: "RENT" or "BUY"; defaults to "RENT" (the original behaviour).

    Returns:
        Property summary dicts from all result pages, capped by rightmove's
        1000-result API limit.
    """
    RESULTS_PER_PAGE = 24

    def make_url(offset: int) -> str:
        """Build a search-API URL for the given page offset."""
        url = "https://www.rightmove.co.uk/api/_search?"
        params = {
            "areaSizeUnit": "sqft",
            "channel": channel,  # BUY or RENT
            "currencyCode": "GBP",
            "includeSSTC": "false",
            "index": offset,  # page offset
            "isFetching": "false",
            "locationIdentifier": location_id,  # e.g.: "REGION^61294"
            "numberOfPropertiesPerPage": RESULTS_PER_PAGE,
            "radius": "0.0",
            "sortType": "6",
            "viewType": "LIST",
        }
        return url + urlencode(params)

    first_page = await client.get(make_url(0))
    first_page_data = json.loads(first_page.content)
    # resultCount is formatted with thousands separators, e.g. "1,234"
    total_results = int(first_page_data['resultCount'].replace(',', ''))
    results = first_page_data['properties']

    other_pages = []
    # rightmove sets the API limit to 1000 properties
    max_api_results = 1000
    for offset in range(RESULTS_PER_PAGE, total_results, RESULTS_PER_PAGE):
        # stop scheduling more pages when the scraper reaches the API limit
        if offset >= max_api_results:
            break
        other_pages.append(client.get(make_url(offset)))
    for future in asyncio.as_completed(other_pages):
        response = await future
        data = json.loads(response.text)
        results.extend(data['properties'])
    return results
|
|
||
|
async def run():
    """Scrape a full rental search for one location and dump it to a JSON file."""
    # Change location to search other areas of the country.
    location = 'manchester'
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    out_path = Path(f'raw-data/{timestamp}_right-move-{location}.json')
    # create the output directory up front so open() cannot fail on a fresh checkout
    out_path.parent.mkdir(parents=True, exist_ok=True)
    location_id = (await find_locations(location))[0]
    print(location_id)
    location_results = await scrape_search(location_id)
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(location_results, f, ensure_ascii=False, indent=2)
|
|
||
|
# script entry point: run the whole scrape in a single asyncio event loop
if __name__ == "__main__":
    asyncio.run(run())