Craig Oates
3 months ago
1 changed file with 183 additions and 0 deletions
@@ -0,0 +1,183 @@
import asyncio
import json
import datetime
import os
from typing import List, Optional, TypedDict
from urllib.parse import urlencode

import jmespath
from httpx import AsyncClient, Response
from parsel import Selector


class PropertyResult(TypedDict):
    """this is what our result dataset will look like"""
    id: str
    available: bool
    archived: bool
    phone: str
    bedrooms: int
    bathrooms: int
    type: str
    property_type: str
    tags: list
    description: str
    title: str
    subtitle: str
    price: str
    price_sqft: str
    address: dict
    latitude: float
    longitude: float
    features: list
    history: dict
    photos: list
    floorplans: list
    agency: dict
    industryAffiliations: list
    nearest_airports: list
    nearest_stations: list
    sizings: list
    brochures: list


# 1. establish HTTP client with browser-like headers to avoid being blocked
client = AsyncClient(
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
    },
    follow_redirects=True,
    http2=True,  # enable http2 to reduce block chance
    timeout=30,
)
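# NOTE: http2=True needs httpx's optional HTTP/2 extra: pip install 'httpx[http2]'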


# 2. parse the property dataset embedded in the page with JMESPath
def parse_property(data: dict) -> PropertyResult:
    """parse rightmove cache data for property information"""
    # here we define field name to JMESPath mapping
    parse_map = {
        "id": "id",
        "available": "status.published",
        "archived": "status.archived",
        "phone": "contactInfo.telephoneNumbers.localNumber",
        "bedrooms": "bedrooms",
        "bathrooms": "bathrooms",
        "type": "transactionType",
        "property_type": "propertySubType",
        "tags": "tags",
        "description": "text.description",
        "title": "text.pageTitle",
        "subtitle": "text.propertyPhrase",
        "price": "prices.primaryPrice",
        "price_sqft": "prices.pricePerSqFt",
        "address": "address",
        "latitude": "location.latitude",
        "longitude": "location.longitude",
        "features": "keyFeatures",
        "history": "listingHistory",
        "photos": "images[*].{url: url, caption: caption}",
        "floorplans": "floorplans[*].{url: url, caption: caption}",
        "agency": """customer.{
            id: branchId,
            branch: branchName,
            company: companyName,
            address: displayAddress,
            commercial: commercial,
            buildToRent: buildToRent,
            isNew: isNewHomeDeveloper
        }""",
        "industryAffiliations": "industryAffiliations[*].name",
        "nearest_airports": "nearestAirports[*].{name: name, distance: distance}",
        "nearest_stations": "nearestStations[*].{name: name, distance: distance}",
        "sizings": "sizings[*].{unit: unit, min: minimumSize, max: maximumSize}",
        "brochures": "brochures",
    }
    # run each JMESPath query against the page data and collect the values
    results = {}
    for key, path in parse_map.items():
        value = jmespath.search(path, data)
        results[key] = value
    return results
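
# A quick sanity check of the JMESPath mapping above, run on a made-up
# (hypothetical) fragment of page data, not a real Rightmove payload:
#
#   sample = {"prices": {"primaryPrice": "£1,250 pcm"},
#             "images": [{"url": "a.jpg", "caption": "front", "order": 1}]}
#   jmespath.search("prices.primaryPrice", sample)
#   # -> "£1,250 pcm"
#   jmespath.search("images[*].{url: url, caption: caption}", sample)
#   # -> [{"url": "a.jpg", "caption": "front"}]  (extra keys are dropped)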


# This function will find the PAGE_MODEL javascript variable and extract it
def extract_property(response: Response) -> Optional[dict]:
    """extract property data from rightmove PAGE_MODEL javascript variable"""
    selector = Selector(response.text)
    data = selector.xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
    if not data:
        print(f"page {response.url} is not a property listing page")
        return None
    # strip the javascript assignment so only the JSON object remains
    data = data.split("PAGE_MODEL = ", 1)[1].strip()
    data = json.loads(data)
    return data["propertyData"]


# this is our main scraping function that takes urls and returns the data
async def scrape_properties(urls: List[str]) -> List[dict]:
    """Scrape Rightmove property listings for property data"""
    to_scrape = [client.get(url) for url in urls]
    properties = []
    for response in asyncio.as_completed(to_scrape):
        response = await response
        data = extract_property(response)
        if data:  # skip pages that turned out not to be property listings
            properties.append(parse_property(data))
    return properties


async def find_locations(query: str) -> List[str]:
    """use rightmove's typeahead api to find location IDs. Returns a list of
    location IDs in most-likely-first order"""
    # rightmove uses two-character tokens, so "cornwall" becomes "CO/RN/WA/LL"
    tokenize_query = "".join(c + ("/" if i % 2 == 0 else "") for i, c in enumerate(query.upper(), start=1))
    url = f"https://www.rightmove.co.uk/typeAhead/uknostreet/{tokenize_query.strip('/')}/"
    response = await client.get(url)
    data = json.loads(response.text)
    return [prediction["locationIdentifier"] for prediction in data["typeAheadLocations"]]
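
# Worked example of the tokenizer above: for query "cornwall",
#   "".join(c + ("/" if i % 2 == 0 else "") for i, c in enumerate("CORNWALL", start=1))
# yields "CO/RN/WA/LL/", and the trailing "/" is stripped when the URL is built.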


async def scrape_search(location_id: str) -> List[dict]:
    """scrape all available property listings for a given location ID"""
    RESULTS_PER_PAGE = 24

    def make_url(offset: int) -> str:
        url = "https://www.rightmove.co.uk/api/_search?"
        params = {
            "areaSizeUnit": "sqft",
            "channel": "RENT",  # BUY or RENT
            "currencyCode": "GBP",
            "includeSSTC": "false",
            "index": offset,  # page offset
            "isFetching": "false",
            "locationIdentifier": location_id,  # e.g. "REGION^61294"
            "numberOfPropertiesPerPage": RESULTS_PER_PAGE,
            "radius": "0.0",
            "sortType": "6",
            "viewType": "LIST",
        }
        return url + urlencode(params)

    first_page = await client.get(make_url(0))
    first_page_data = json.loads(first_page.content)
    total_results = int(first_page_data["resultCount"].replace(",", ""))
    results = first_page_data["properties"]

    other_pages = []
    # rightmove caps the search API at 1000 properties per query
    max_api_results = 1000
    for offset in range(RESULTS_PER_PAGE, total_results, RESULTS_PER_PAGE):
        # stop scheduling more pages once the scraper reaches the API limit
        if offset >= max_api_results:
            break
        other_pages.append(client.get(make_url(offset)))
    for response in asyncio.as_completed(other_pages):
        response = await response
        data = json.loads(response.text)
        results.extend(data["properties"])
    return results
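
# Pagination sanity check: with RESULTS_PER_PAGE = 24 and, say, 300 results,
# the loop above requests offsets range(24, 300, 24) == 24, 48, ..., 288, and
# offset 0 was already fetched as first_page, so every listing is covered once
# (up to the 1000-result API cap).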


async def run():
    # Change location to search other areas of the country.
    location = "manchester"
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    fname = f"raw-data/{timestamp}_right-move-{location}.json"
    location_id = (await find_locations(location))[0]
    print(location_id)
    location_results = await scrape_search(location_id)
    os.makedirs("raw-data", exist_ok=True)  # make sure the output directory exists
    with open(fname, "w", encoding="utf-8") as f:
        json.dump(location_results, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    asyncio.run(run())
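
# scrape_properties() above is not wired into run(); a minimal sketch of calling
# it directly (the listing URL is a made-up placeholder, not a real property):
#
#   async def scrape_one():
#       urls = ["https://www.rightmove.co.uk/properties/000000000"]
#       properties = await scrape_properties(urls)
#       print(json.dumps(properties, indent=2, ensure_ascii=False))
#
#   asyncio.run(scrape_one())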