Craig Oates
3 months ago
1 changed file with 183 additions and 0 deletions
@@ -0,0 +1,183 @@
import asyncio
import json
import datetime
import os
from typing import List, Optional, TypedDict
from urllib.parse import urlencode

import jmespath
from httpx import AsyncClient, Response
from parsel import Selector


class PropertyResult(TypedDict):
    """this is what our result dataset will look like"""
    id: str
    available: bool
    archived: bool
    phone: str
    bedrooms: int
    bathrooms: int
    type: str
    property_type: str
    tags: list
    description: str
    title: str
    subtitle: str
    price: str
    price_sqft: str
    address: dict
    latitude: float
    longitude: float
    features: list
    history: dict
    photos: list
    floorplans: list
    agency: dict
    industryAffiliations: list
    nearest_airports: list
    nearest_stations: list
    sizings: list
    brochures: list


# 1. establish HTTP client with browser-like headers to avoid being blocked
client = AsyncClient(
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
    },
    follow_redirects=True,
    http2=True,  # enable http2 to reduce block chance
    timeout=30,
)
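# NOTE: http2=True needs httpx's optional HTTP/2 extra: pip install 'httpx[http2]'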


# 2. parse the property dataset embedded in the page with JMESPath
def parse_property(data: dict) -> PropertyResult:
    """parse rightmove cache data for property information"""
    # here we define field name to JMESPath mapping
    parse_map = {
        "id": "id",
        "available": "status.published",
        "archived": "status.archived",
        "phone": "contactInfo.telephoneNumbers.localNumber",
        "bedrooms": "bedrooms",
        "bathrooms": "bathrooms",
        "type": "transactionType",
        "property_type": "propertySubType",
        "tags": "tags",
        "description": "text.description",
        "title": "text.pageTitle",
        "subtitle": "text.propertyPhrase",
        "price": "prices.primaryPrice",
        "price_sqft": "prices.pricePerSqFt",
        "address": "address",
        "latitude": "location.latitude",
        "longitude": "location.longitude",
        "features": "keyFeatures",
        "history": "listingHistory",
        "photos": "images[*].{url: url, caption: caption}",
        "floorplans": "floorplans[*].{url: url, caption: caption}",
        "agency": """customer.{
            id: branchId,
            branch: branchName,
            company: companyName,
            address: displayAddress,
            commercial: commercial,
            buildToRent: buildToRent,
            isNew: isNewHomeDeveloper
        }""",
        "industryAffiliations": "industryAffiliations[*].name",
        "nearest_airports": "nearestAirports[*].{name: name, distance: distance}",
        "nearest_stations": "nearestStations[*].{name: name, distance: distance}",
        "sizings": "sizings[*].{unit: unit, min: minimumSize, max: maximumSize}",
        "brochures": "brochures",
    }
    # run each JMESPath query against the page data and collect the values
    results = {}
    for key, path in parse_map.items():
        value = jmespath.search(path, data)
        results[key] = value
    return results
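
# A quick sanity check of the JMESPath mapping above, run on a made-up
# (hypothetical) fragment of page data, not a real Rightmove payload:
#
#   sample = {"prices": {"primaryPrice": "£1,250 pcm"},
#             "images": [{"url": "a.jpg", "caption": "front", "order": 1}]}
#   jmespath.search("prices.primaryPrice", sample)
#   # -> "£1,250 pcm"
#   jmespath.search("images[*].{url: url, caption: caption}", sample)
#   # -> [{"url": "a.jpg", "caption": "front"}]  (extra keys are dropped)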


# This function will find the PAGE_MODEL javascript variable and extract it
def extract_property(response: Response) -> Optional[dict]:
    """extract property data from rightmove PAGE_MODEL javascript variable"""
    selector = Selector(response.text)
    data = selector.xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
    if not data:
        print(f"page {response.url} is not a property listing page")
        return None
    # strip the javascript assignment so only the JSON object remains
    data = data.split("PAGE_MODEL = ", 1)[1].strip()
    data = json.loads(data)
    return data["propertyData"]


# this is our main scraping function that takes urls and returns the data
async def scrape_properties(urls: List[str]) -> List[dict]:
    """Scrape Rightmove property listings for property data"""
    to_scrape = [client.get(url) for url in urls]
    properties = []
    for response in asyncio.as_completed(to_scrape):
        response = await response
        data = extract_property(response)
        if data:  # skip pages that turned out not to be property listings
            properties.append(parse_property(data))
    return properties


async def find_locations(query: str) -> List[str]:
    """use rightmove's typeahead api to find location IDs. Returns a list of
    location IDs in most-likely-first order"""
    # rightmove uses two-character tokens, so "cornwall" becomes "CO/RN/WA/LL"
    tokenize_query = "".join(c + ("/" if i % 2 == 0 else "") for i, c in enumerate(query.upper(), start=1))
    url = f"https://www.rightmove.co.uk/typeAhead/uknostreet/{tokenize_query.strip('/')}/"
    response = await client.get(url)
    data = json.loads(response.text)
    return [prediction["locationIdentifier"] for prediction in data["typeAheadLocations"]]
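
# Worked example of the tokenizer above: for query "cornwall",
#   "".join(c + ("/" if i % 2 == 0 else "") for i, c in enumerate("CORNWALL", start=1))
# yields "CO/RN/WA/LL/", and the trailing "/" is stripped when the URL is built.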


async def scrape_search(location_id: str) -> List[dict]:
    """scrape all available property listings for a given location ID"""
    RESULTS_PER_PAGE = 24

    def make_url(offset: int) -> str:
        url = "https://www.rightmove.co.uk/api/_search?"
        params = {
            "areaSizeUnit": "sqft",
            "channel": "RENT",  # BUY or RENT
            "currencyCode": "GBP",
            "includeSSTC": "false",
            "index": offset,  # page offset
            "isFetching": "false",
            "locationIdentifier": location_id,  # e.g. "REGION^61294"
            "numberOfPropertiesPerPage": RESULTS_PER_PAGE,
            "radius": "0.0",
            "sortType": "6",
            "viewType": "LIST",
        }
        return url + urlencode(params)

    first_page = await client.get(make_url(0))
    first_page_data = json.loads(first_page.content)
    total_results = int(first_page_data["resultCount"].replace(",", ""))
    results = first_page_data["properties"]

    other_pages = []
    # rightmove caps the search API at 1000 properties per query
    max_api_results = 1000
    for offset in range(RESULTS_PER_PAGE, total_results, RESULTS_PER_PAGE):
        # stop scheduling more pages once the scraper reaches the API limit
        if offset >= max_api_results:
            break
        other_pages.append(client.get(make_url(offset)))
    for response in asyncio.as_completed(other_pages):
        response = await response
        data = json.loads(response.text)
        results.extend(data["properties"])
    return results
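
# Pagination sanity check: with RESULTS_PER_PAGE = 24 and, say, 300 results,
# the loop above requests offsets range(24, 300, 24) == 24, 48, ..., 288, and
# offset 0 was already fetched as first_page, so every listing is covered once
# (up to the 1000-result API cap).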


async def run():
    # Change location to search other areas of the country.
    location = "manchester"
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    fname = f"raw-data/{timestamp}_right-move-{location}.json"
    location_id = (await find_locations(location))[0]
    print(location_id)
    location_results = await scrape_search(location_id)
    os.makedirs("raw-data", exist_ok=True)  # make sure the output directory exists
    with open(fname, "w", encoding="utf-8") as f:
        json.dump(location_results, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    asyncio.run(run())
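
# scrape_properties() above is not wired into run(); a minimal sketch of calling
# it directly (the listing URL is a made-up placeholder, not a real property):
#
#   async def scrape_one():
#       urls = ["https://www.rightmove.co.uk/properties/000000000"]
#       properties = await scrape_properties(urls)
#       print(json.dumps(properties, indent=2, ensure_ascii=False))
#
#   asyncio.run(scrape_one())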