You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
184 lines
6.7 KiB
184 lines
6.7 KiB
3 months ago
|
import asyncio
import datetime
import json
from pathlib import Path
from typing import List, Optional, TypedDict
from urllib.parse import urlencode

import jmespath
from httpx import AsyncClient, Response
from parsel import Selector
||
|
class PropertyResult(TypedDict):
    """Schema of one scraped Rightmove property record.

    Keys mirror the JMESPath extraction map in ``parse_property``; any
    field missing from the source page comes through as ``None``.
    """

    # identity and listing status
    id: str
    available: bool
    archived: bool
    phone: str
    # core attributes
    bedrooms: int
    bathrooms: int
    type: str
    property_type: str
    tags: list
    # listing text and pricing
    description: str
    title: str
    subtitle: str
    price: str
    price_sqft: str
    # location
    address: dict
    latitude: float
    longitude: float
    # media and additional details
    features: list
    history: dict
    photos: list
    floorplans: list
    agency: dict
    industryAffiliations: list
    nearest_airports: list
    nearest_stations: list
    sizings: list
    brochures: list
||
|
# 1. establish HTTP client with browser-like headers to avoid being blocked
# NOTE(review): one module-level client is shared by every coroutine below and
# is never closed explicitly — acceptable for a one-shot script.
client = AsyncClient(
    headers={
        # desktop Chrome User-Agent lowers the chance of bot detection
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
    },
    follow_redirects=True,
    http2=True,  # enable http2 to reduce block chance
    timeout=30,
)
||
|
# NOTE: requires the third-party `jmespath` package (imported at file top)
def parse_property(data: dict) -> PropertyResult:
    """Parse Rightmove cache data for property information.

    Args:
        data: the ``propertyData`` JSON object extracted from a listing page.

    Returns:
        A dict with every ``PropertyResult`` key; paths missing from the
        input yield ``None`` (``jmespath.search`` returns None on no match).
    """
    # here we define field name to JMESPath mapping
    parse_map = {
        "id": "id",
        "available": "status.published",
        "archived": "status.archived",
        "phone": "contactInfo.telephoneNumbers.localNumber",
        "bedrooms": "bedrooms",
        "bathrooms": "bathrooms",
        "type": "transactionType",
        "property_type": "propertySubType",
        "tags": "tags",
        "description": "text.description",
        "title": "text.pageTitle",
        "subtitle": "text.propertyPhrase",
        "price": "prices.primaryPrice",
        "price_sqft": "prices.pricePerSqFt",
        "address": "address",
        "latitude": "location.latitude",
        "longitude": "location.longitude",
        "features": "keyFeatures",
        "history": "listingHistory",
        "photos": "images[*].{url: url, caption: caption}",
        "floorplans": "floorplans[*].{url: url, caption: caption}",
        "agency": """customer.{
            id: branchId,
            branch: branchName,
            company: companyName,
            address: displayAddress,
            commercial: commercial,
            buildToRent: buildToRent,
            isNew: isNewHomeDeveloper
        }""",
        "industryAffiliations": "industryAffiliations[*].name",
        "nearest_airports": "nearestAirports[*].{name: name, distance: distance}",
        "nearest_stations": "nearestStations[*].{name: name, distance: distance}",
        "sizings": "sizings[*].{unit: unit, min: minimumSize, max: maximumSize}",
        "brochures": "brochures",
    }
    # run every query against the raw data; comprehension replaces the manual loop
    return {key: jmespath.search(path, data) for key, path in parse_map.items()}
||
|
# This function will find the PAGE_MODEL javascript variable and extract it
def extract_property(response: Response) -> Optional[dict]:
    """Extract property data from the Rightmove PAGE_MODEL javascript variable.

    Args:
        response: an HTTP response for a Rightmove listing URL.

    Returns:
        The ``propertyData`` object embedded in the page, or ``None`` when
        the page is not a property listing page (no PAGE_MODEL script found).
    """
    selector = Selector(response.text)
    data = selector.xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
    if not data:
        print(f"page {response.url} is not a property listing page")
        # explicit None: the annotation advertises that callers must check
        return None
    # everything after the assignment is the JSON object literal
    data = data.split("PAGE_MODEL = ", 1)[1].strip()
    data = json.loads(data)
    return data["propertyData"]
|
||
|
|
||
|
# this is our main scraping function that takes urls and returns the data
async def scrape_properties(urls: List[str]) -> List[dict]:
    """Scrape Rightmove property listings for property data.

    Args:
        urls: listing page URLs to fetch concurrently.

    Returns:
        Parsed property dicts; URLs that are not listing pages are skipped.
    """
    to_scrape = [client.get(url) for url in urls]
    properties = []
    # as_completed yields results in completion order, not input order
    for future in asyncio.as_completed(to_scrape):
        response = await future
        data = extract_property(response)
        # extract_property returns None for non-listing pages — don't parse those
        if data is not None:
            properties.append(parse_property(data))
    return properties
|
|
||
|
async def find_locations(query: str) -> List[str]:
    """Use rightmove's typeahead api to find location IDs.

    Returns a list of location IDs in most likely order.
    """
    # rightmove uses two character long tokens so "cornwall" becomes "CO/RN/WA/LL"
    upper = query.upper()
    tokenized = "/".join(upper[i:i + 2] for i in range(0, len(upper), 2))
    url = f"https://www.rightmove.co.uk/typeAhead/uknostreet/{tokenized}/"
    response = await client.get(url)
    data = json.loads(response.text)
    return [prediction["locationIdentifier"] for prediction in data["typeAheadLocations"]]
|
||
|
|
||
|
async def scrape_search(location_id: str, channel: str = "RENT") -> List[dict]:
    """Scrape every page of Rightmove search results for one location.

    Args:
        location_id: rightmove location identifier, e.g. "REGION^61294".
        channel: "RENT" or "BUY"; defaults to "RENT" (the original behaviour).

    Returns:
        Property summary dicts from all result pages, capped by rightmove's
        1000-result API limit.
    """
    RESULTS_PER_PAGE = 24

    def make_url(offset: int) -> str:
        """Build a search-API URL for the given page offset."""
        url = "https://www.rightmove.co.uk/api/_search?"
        params = {
            "areaSizeUnit": "sqft",
            "channel": channel,  # BUY or RENT
            "currencyCode": "GBP",
            "includeSSTC": "false",
            "index": offset,  # page offset
            "isFetching": "false",
            "locationIdentifier": location_id,  # e.g.: "REGION^61294"
            "numberOfPropertiesPerPage": RESULTS_PER_PAGE,
            "radius": "0.0",
            "sortType": "6",
            "viewType": "LIST",
        }
        return url + urlencode(params)

    first_page = await client.get(make_url(0))
    first_page_data = json.loads(first_page.content)
    # resultCount is formatted with thousands separators, e.g. "1,234"
    total_results = int(first_page_data['resultCount'].replace(',', ''))
    results = first_page_data['properties']

    other_pages = []
    # rightmove sets the API limit to 1000 properties
    max_api_results = 1000
    for offset in range(RESULTS_PER_PAGE, total_results, RESULTS_PER_PAGE):
        # stop scheduling more pages when the scraper reaches the API limit
        if offset >= max_api_results:
            break
        other_pages.append(client.get(make_url(offset)))
    for future in asyncio.as_completed(other_pages):
        response = await future
        data = json.loads(response.text)
        results.extend(data['properties'])
    return results
|
|
||
|
async def run():
    """Scrape a full rental search for one location and dump it to a JSON file."""
    # Change location to search other areas of the country.
    location = 'manchester'
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    out_path = Path(f'raw-data/{timestamp}_right-move-{location}.json')
    # create the output directory up front so open() cannot fail on a fresh checkout
    out_path.parent.mkdir(parents=True, exist_ok=True)
    location_id = (await find_locations(location))[0]
    print(location_id)
    location_results = await scrape_search(location_id)
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(location_results, f, ensure_ascii=False, indent=2)
|
|
||
|
# script entry point: run the whole scrape in a single asyncio event loop
if __name__ == "__main__":
    asyncio.run(run())