|
|
|
#+options: ':nil *:t -:t ::t <:t H:3 \n:nil ^:t arch:headline author:t
|
|
|
|
#+options: broken-links:nil c:nil creator:nil d:(not "LOGBOOK") date:t e:t
|
|
|
|
#+options: email:nil f:t inline:t num:t p:nil pri:nil prop:nil stat:t tags:t
|
|
|
|
#+options: tasks:t tex:t timestamp:t title:t toc:t todo:t |:t
|
|
|
|
#+title: Spare Room Manchester
|
|
|
|
#+date: today
|
|
|
|
#+author: Craig Oates
|
|
|
|
#+email: craig@craigoates.net
|
|
|
|
#+language: en
|
|
|
|
#+select_tags: export
|
|
|
|
#+exclude_tags: noexport
|
|
|
|
#+creator: Emacs 28.2 (Org mode 9.5.5)
|
|
|
|
#+cite_export:
|
|
|
|
|
|
|
|
* Setup Common Lisp Environment
|
|
|
|
|
|
|
|
You will not need to execute this code block if you've already set up SLIME in
|
|
|
|
another ORG file. This is just in case this is the only file you're working on
|
|
|
|
today, or it's your first file of the day.
|
|
|
|
|
|
|
|
*Run ~m-x slime~ before running the following code.* And, make note of the
|
|
|
|
~:session~ attribute. It allows the code in the code block to be used
|
|
|
|
in other code blocks which also use the ~:session~ attribute.
|
|
|
|
|
|
|
|
#+begin_src lisp :session :results silent
;; Load every third-party system this notebook relies on. Run `M-x slime'
;; first; the :session header shares the resulting image with the other
;; :session blocks in this file.
(dolist (system '(:com.inuoe.jzon ; JSON parser.
                  :dexador        ; HTTP requests.
                  :plump          ; HTML/XML parser.
                  :lquery         ; HTML/DOM manipulation.
                  :lparallel      ; Parallel programming.
                  :cl-ppcre       ; RegEx. library.
                  :plot/vega      ; Vega plotting library.
                  :lisp-stat      ; Stat's library.
                  :data-frame     ; Data frame library eqv. to Python's Numpy.
                  :str))          ; String library, expands on 'string' library.
  (ql:quickload system))
#+end_src
|
|
|
|
|
|
|
|
* Gather Spare Room Data
|
|
|
|
|
|
|
|
Having done a quick manual search on [[https://www.spareroom.co.uk][Spare Room]], using =Manchester= as the
|
|
|
|
location/search term, the results page says =Showing 1-10 of 1000+
|
|
|
|
results=. So the total page count is,
|
|
|
|
|
|
|
|
#+begin_src calc :results raw
|
|
|
|
1000 / 10
|
|
|
|
#+end_src
|
|
|
|
|
|
|
|
#+RESULTS:
|
|
|
|
100
|
|
|
|
|
|
|
|
Because I’m going to be grabbing that many pages, it’ll be better if I do this once
|
|
|
|
and save the raw HTML on my computer. That way, I can parse and sift through the
|
|
|
|
data without putting needless stress on the Spare Room servers – by repeatedly
|
|
|
|
scraping and parsing the data as I work out the bugs.
|
|
|
|
|
|
|
|
#+begin_src shell :results silent
# Create the scratch directory for downloaded HTML. It holds files ignored
# by Git. -p makes the command idempotent (no error when the directory
# already exists) and also creates the parent "raw-data" when missing.
mkdir -p "raw-data/external"
#+end_src
|
|
|
|
|
|
|
|
#+begin_src shell
# Download every results page once (offsets 0, 10, ... 990 -- 100 pages),
# so later parsing passes run against local copies instead of repeatedly
# hitting the Spare Room servers.
cd raw-data/external || exit 1

DIRECTORY="$(date '+%Y-%m-%d')_spare-room-manc"

# Quote the expansion and use -p so a partial earlier run does not abort
# the script here.
mkdir -p "$DIRECTORY"

for OFFSET in {0..990..10}
do
    curl -o "$DIRECTORY/spare-room-manc-$OFFSET.html" \
         "https://www.spareroom.co.uk/flatshare/index.cgi?offset=$OFFSET&search_id=1281027880&sort_by=price_low_to_high&mode=list"
    # Be polite to the server: pause between requests.
    sleep 5
done

# Return to the project's root directory. (Org runs this block in its own
# subshell, so this only matters for commands added later in this block --
# it cannot leak into other blocks.)
cd ../../
#+end_src
|
|
|
|
|
|
|
|
* Clean Up and Parse Data
|
|
|
|
|
|
|
|
** Failed Attempt (Parse Whole Page)
|
|
|
|
|
|
|
|
This is my initial attempt to parse the HTML files I gathered from Spare
|
|
|
|
Room's website.
|
|
|
|
|
|
|
|
#+begin_src lisp :results silent
;; This is here for future reference, I don't recommend you run it.
;;
;; First attempt: parse a results page in one pass by collecting every
;; ".listingLocation" and ".listingPrice" node, pairing them up
;; positionally, and writing a two-column CSV. The positional pairing is
;; exactly what makes the output unreliable (see the prose below).
(loop for file-path
      ;; NOTE(review): this pathname names a single file (...-0.html), so
      ;; the loop only ever processes one page.
      in (directory #P"raw-data/external/2024-02-23_spare-room-manc/spare-room-manc-0.html")
      do
      (with-open-file (stream file-path)
        (let* ((doc (plump:parse stream))
               (locations (lquery:$ doc ".listingLocation"))
               (prices (lquery:$ doc ".listingPrice"))
               ;; Pairs element i of LOCATIONS with element i of PRICES; a
               ;; missing price anywhere misaligns every following row.
               (data (loop for loc across locations
                           for price across prices
                           collect (list (plump:text loc) (plump:text price)))))
          ;; Overwrite (not append) the CSV on each run.
          (with-open-file (stream
                           #P"working-data/2024-02-23-spare-room-manc.csv"
                           :direction :output
                           :if-exists :supersede)
            (format stream "Location,Price~%")
            (dolist (item data)
              (format stream "~a,~a~%" (first item) (second item)))))))
#+end_src
|
|
|
|
|
|
|
|
Unfortunately, this approach produces inconsistent results. There are too many
|
|
|
|
lines of text with excessive spacing. There are newlines that, with no rhyme or reason,
|
|
|
|
keep breaking the formatting, also. Having had a play around with the data, I
|
|
|
|
don't feel confident each entry has the rent price in the same place. So, I
|
|
|
|
think there is a chance the code which processes the files will produce
|
|
|
|
misaligned data. In other words, the rent for one location might be placed next
|
|
|
|
to the location which came before it in the processing pipeline (i.e. an
|
|
|
|
/off-by-one/ bug). Because of this, I think a more secure approach would be to
|
|
|
|
extract each listing into its own file and process one file at a time. This will,
|
|
|
|
hopefully, make it easier to identify if a listing has missing or ill-formed
|
|
|
|
pricing, location etc. data.
|
|
|
|
|
|
|
|
** Separate Each Listing into Their Own File
|
|
|
|
|
|
|
|
#+begin_src shell
|
|
|
|
mkdir "raw-data/external/2024-02-23_spare-room-manc-listings/"
|
|
|
|
#+end_src
|
|
|
|
|
|
|
|
#+begin_src lisp :results silent
;; Split every saved results page into one file per listing, so each
;; listing can be parsed (and debugged) in isolation.
(let ((counter 0))
  (loop for file-path
        in (directory #P"raw-data/external/2024-02-23_spare-room-manc/*.html")
        do (with-open-file (in-stream file-path)
             (let* ((doc (plump:parse in-stream))
                    ;; One serialized HTML string per ".listing-result" node.
                    (listings (lquery:$ doc ".listing-result" (serialize))))
               (loop for item across listings
                     do (let ((out-path
                                ;; Conventional MERGE-PATHNAMES order: the
                                ;; file name first, the directory as the
                                ;; defaults. (The original had the arguments
                                ;; reversed, which only worked by accident.)
                                (merge-pathnames
                                 (format nil "listing-~d.html" counter)
                                 #P"raw-data/external/2024-02-23_spare-room-manc-listings/")))
                          ;; Create the target directory if the earlier
                          ;; shell block was skipped.
                          (ensure-directories-exist out-path)
                          (with-open-file (out-stream
                                           out-path
                                           :direction :output
                                           :if-exists :supersede)
                            (format out-stream "~a" item))
                          (incf counter)))))))
#+end_src
|
|
|
|
|
|
|
|
** Create CSV of Listings
|
|
|
|
|
|
|
|
#+begin_src lisp :results silent
;; Walk every per-listing HTML file and emit one CSV row per listing.
;; Columns: ROW-ID, LISTING-ID, LOCATION, POSTCODE, RAW-PRICE, PRICE,
;; FREQUENCY, PRICE-DETAILS, URL.
(with-open-file (out-stream
                 #P"working-data/2024-02-23-spare-room-manc.csv"
                 :direction :output
                 :if-exists :supersede)
  (format out-stream "ROW-ID, LISTING-ID, LOCATION, POSTCODE, RAW-PRICE, PRICE, FREQUENCY, PRICE-DETAILS, URL~%")
  (let ((row-id 0))
    (loop for file-path
          in (directory #P"raw-data/external/2024-02-23_spare-room-manc-listings/*.html")
          do (with-open-file (in-stream file-path)
               (let* ((doc (plump:parse in-stream))
                      ;; lquery returns vectors; each file holds one
                      ;; listing, so element 0 is used throughout.
                      (listing-id (lquery:$ doc ".listing-result" (attr "data-listing-id")))
                      (price-details (lquery:$ doc ".listingPriceDetails" (text)))
                      (location (lquery:$ doc ".listingLocation" (text)))
                      (url (lquery:$ doc "article" "a" (attr "href")))
                      (postcode (lquery:$ doc ".listing-result" (attr "data-listing-postcode")))
                      (price (lquery:$ doc ".listingPrice" (text)))
                      ;; Strip thousands separators ("£1,050" -> "£1050") so
                      ;; the digit extraction below sees one digit run, and
                      ;; so the comma cannot break the CSV column layout.
                      (cleaned-price (if (str:contains? "," (aref price 0))
                                         (str:replace-all "," "" (aref price 0))
                                         (aref price 0))))
                 (format out-stream "~a,~a,~a,~a,~a,~a,~a,~a,~a"
                         (write-to-string row-id)
                         (aref listing-id 0)
                         (aref location 0)
                         (aref postcode 0)
                         cleaned-price
                         ;; First run of digits = the rent amount.
                         (first (cl-ppcre:all-matches-as-strings "\\d+" cleaned-price))
                         ;; "pw" (per week) or "pcm" (per calendar month).
                         (first (cl-ppcre:all-matches-as-strings "(pw|pcm)" cleaned-price))
                         (aref price-details 0)
                         ;; BUG FIX: the scheme previously read "https//"
                         ;; (missing colon), producing broken URLs.
                         (format nil "https://www.spareroom.co.uk~a~%" (aref url 0)))
                 (incf row-id))))))
#+end_src
|