You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
183 lines
6.7 KiB
183 lines
6.7 KiB
import asyncio |
|
import json |
|
import datetime |
|
from typing import List, TypedDict |
|
from urllib.parse import urlencode |
|
from httpx import AsyncClient, Response |
|
from parsel import Selector |
|
|
|
class PropertyResult(TypedDict):
    """Shape of one parsed property record in the result dataset.

    The keys mirror the field names used by ``parse_property``'s
    field-to-JMESPath mapping, so the two must be kept in sync.
    """

    # identity and listing status
    id: str
    available: bool
    archived: bool
    phone: str

    # basic property facts
    bedrooms: int
    bathrooms: int
    type: str
    property_type: str
    tags: list

    # listing text
    description: str
    title: str
    subtitle: str

    # pricing
    price: str
    price_sqft: str

    # location
    address: dict
    latitude: float
    longitude: float

    # details and media
    features: list
    history: dict
    photos: list
    floorplans: list

    # agent / agency
    agency: dict
    industryAffiliations: list

    # surroundings and extras
    nearest_airports: list
    nearest_stations: list
    sizings: list
    brochures: list
|
|
|
# 1. establish HTTP client with browser-like headers to avoid being blocked |
|
client = AsyncClient(
    # Shared by every request in this module; the header values imitate a
    # desktop Chrome browser.
    headers={
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/62.0.3202.94 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
    },
    follow_redirects=True,
    http2=True,  # enable http2 to reduce block chance
    timeout=30,
)
|
|
|
# Flatten the raw property data into the fields declared by PropertyResult
|
def parse_property(data) -> PropertyResult:
    """Parse Rightmove property data into a flat ``PropertyResult`` mapping.

    Every output field is produced by evaluating one JMESPath expression
    against ``data``.

    Args:
        data: The property data object, e.g. what ``extract_property``
            returns.

    Returns:
        A dict with one entry per key of the field-to-path mapping below;
        each value is whatever ``jmespath.search`` yields for that path.
    """
    # The module's import section never imports jmespath, so without this
    # local import the lookups below fail with NameError.
    import jmespath

    # here we define field name to JMESPath mapping
    parse_map = {
        "id": "id",
        "available": "status.published",
        "archived": "status.archived",
        "phone": "contactInfo.telephoneNumbers.localNumber",
        "bedrooms": "bedrooms",
        "bathrooms": "bathrooms",
        "type": "transactionType",
        "property_type": "propertySubType",
        "tags": "tags",
        "description": "text.description",
        "title": "text.pageTitle",
        "subtitle": "text.propertyPhrase",
        "price": "prices.primaryPrice",
        "price_sqft": "prices.pricePerSqFt",
        "address": "address",
        "latitude": "location.latitude",
        "longitude": "location.longitude",
        "features": "keyFeatures",
        "history": "listingHistory",
        "photos": "images[*].{url: url, caption: caption}",
        "floorplans": "floorplans[*].{url: url, caption: caption}",
        "agency": """customer.{
            id: branchId,
            branch: branchName,
            company: companyName,
            address: displayAddress,
            commercial: commercial,
            buildToRent: buildToRent,
            isNew: isNewHomeDeveloper
        }""",
        "industryAffiliations": "industryAffiliations[*].name",
        "nearest_airports": "nearestAirports[*].{name: name, distance: distance}",
        "nearest_stations": "nearestStations[*].{name: name, distance: distance}",
        "sizings": "sizings[*].{unit: unit, min: minimumSize, max: maximumSize}",
        "brochures": "brochures",
    }
    return {key: jmespath.search(path, data) for key, path in parse_map.items()}
|
|
|
# This function will find the PAGE_MODEL javascript variable and extract it |
|
def extract_property(response: Response) -> dict:
    """Return the ``propertyData`` object embedded in a listing page.

    The page is searched for a ``<script>`` whose text contains
    ``PAGE_MODEL = ``; everything after that marker is decoded as JSON.
    When no such script exists a notice is printed and ``None`` is returned.
    """
    marker = "PAGE_MODEL = "
    page = Selector(response.text)
    script_text = page.xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
    if script_text:
        # keep only what follows the first occurrence of the marker
        raw_json = script_text.split(marker, 1)[1].strip()
        page_model = json.loads(raw_json)
        return page_model["propertyData"]
    print(f"page {response.url} is not a property listing page")
    return None
|
|
|
|
|
# this is our main scraping function that takes urls and returns the data |
|
async def scrape_properties(urls: List[str]) -> List[dict]:
    """Fetch every listing URL and return the parsed property records.

    All requests are created up front and consumed with
    ``asyncio.as_completed``, so the returned list follows completion order
    rather than the order of ``urls``.
    """
    pending = [client.get(url) for url in urls]
    return [
        parse_property(extract_property(await finished))
        for finished in asyncio.as_completed(pending)
    ]
|
|
|
async def find_locations(query: str) -> List[str]:
    """Look up location IDs for ``query`` through rightmove's typeahead API.

    Returns the ``locationIdentifier`` of every entry under
    ``typeAheadLocations``, in the order the API lists them (most likely
    match first, per the original author's note).
    """
    # rightmove uses two character long tokens so "cornwall" becomes "CO/RN/WA/LL"
    upper = query.upper()
    pairs = [upper[start:start + 2] for start in range(0, len(upper), 2)]
    tokenize_query = "/".join(pairs)
    url = f"https://www.rightmove.co.uk/typeAhead/uknostreet/{tokenize_query.strip('/')}/"
    response = await client.get(url)
    payload = json.loads(response.text)
    location_ids = []
    for prediction in payload["typeAheadLocations"]:
        location_ids.append(prediction["locationIdentifier"])
    return location_ids
|
|
|
|
|
async def scrape_search(location_id: str) -> List[dict]:
    """Scrape rightmove's search API for all listings of one location.

    The first page is fetched on its own to learn the total result count;
    the remaining pages are then requested together and appended as they
    complete, so entries after the first page follow completion order.

    Args:
        location_id: Location identifier, e.g. ``"REGION^61294"``.

    Returns:
        The list of property dicts collected from the ``properties`` key of
        every fetched page.
    """
    RESULTS_PER_PAGE = 24
    # rightmove sets the API limit to 1000 properties
    MAX_API_RESULTS = 1000

    def make_url(offset: int) -> str:
        """Build the search API URL for the page starting at ``offset``."""
        url = "https://www.rightmove.co.uk/api/_search?"
        params = {
            "areaSizeUnit": "sqft",
            "channel": "RENT",  # BUY or RENT
            "currencyCode": "GBP",
            "includeSSTC": "false",
            "index": offset,  # page offset
            "isFetching": "false",
            "locationIdentifier": location_id,  # e.g.: "REGION^61294",
            "numberOfPropertiesPerPage": RESULTS_PER_PAGE,
            "radius": "0.0",
            "sortType": "6",
            "viewType": "LIST",
        }
        return url + urlencode(params)

    first_page = await client.get(make_url(0))
    first_page_data = json.loads(first_page.content)
    # resultCount is handled as a string with thousands separators
    # ("1,234"); str() keeps this working if it is a plain number instead.
    total_results = int(str(first_page_data['resultCount']).replace(',', ''))
    results = first_page_data['properties']

    # Request only offsets below both the result count and the API limit.
    last_offset = min(total_results, MAX_API_RESULTS)
    other_pages = [
        client.get(make_url(offset))
        for offset in range(RESULTS_PER_PAGE, last_offset, RESULTS_PER_PAGE)
    ]
    for response in asyncio.as_completed(other_pages):
        response = await response
        data = json.loads(response.text)
        results.extend(data['properties'])
    return results
|
|
|
async def run():
    """Scrape rental search results for one location and save them as JSON.

    The output goes to ``raw-data/<timestamp>_right-move-<location>.json``,
    using the first location ID returned by ``find_locations``.

    Raises:
        IndexError: If the location lookup returns no matches.
    """
    from pathlib import Path

    # Change location to search other areas of the country.
    location = 'manchester'
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    fname = f'raw-data/{timestamp}_right-move-{location}.json'
    # open() does not create missing directories, so make sure the output
    # folder exists up front instead of failing after the scrape finishes.
    Path(fname).parent.mkdir(parents=True, exist_ok=True)
    location_id = (await find_locations(location))[0]
    print(location_id)
    location_results = await scrape_search(location_id)
    with open(fname, 'w', encoding='utf-8') as f:
        json.dump(location_results, f, ensure_ascii=False, indent=2)
|
|
|
# Start the scraper only when this file is executed as a script, so that
# importing the module does not trigger any requests.
if __name__ == "__main__":
    asyncio.run(run())
|
|
|