dev_scratch.Rmd

---
title: "Dev & Testing"
author: "Dr. Henry Lydecker"
date: "28/02/2023"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

library(forcats)
library(dplyr)
library(reshape2)
library(stringr)
library(ggplot2)
library(data.table)
library(lubridate)
library(zoo)
library(rvest)
library(stats)
library(tidyverse)
library(GGally)

```

```{r}
# Where to search.
state <- "NSW"
postcode <- "2088"
suburb <- "Mosman"

# What to search for and how far to page.
prop_config <- c(1, 1)    # target 1 bed 1 bath apartments
props_to_pull <- 25       # pages of apartments to loop through

# One listing URL per result page. Both `bmin` and `bmax` take the first
# element of `prop_config`.
urls <- paste0(
  'https://www.auhouseprices.com/rent/list/',
  paste(state, postcode, suburb, sep = "/"),
  "/",
  seq_len(props_to_pull),
  '/?sort=date&type=apartment&bmin=',
  prop_config[1],
  '&bmax=',
  prop_config[1]
)
```


```{r}
# Scrape apartment rental listings from auhouseprices.com for one suburb
# (loop-based version).
#
# Arguments:
#   state, postcode, suburb  Path components of the listing URL.
#   beds   Bedroom count; sent as both `bmin` and `bmax` in the query string
#          and used to filter the scraped rows.
#   baths  Bathroom count; only used to filter the scraped rows.
#   pages  Number of result pages to request (a single whole number >= 1).
#
# Returns a data.table with columns address, price_month, config, month
# (Date, first day of the month) and price (numeric, NA when the text is not
# a plain number), de-duplicated and restricted to rows whose config starts
# with "<beds> <baths>".
rent_scrape <- function(state, postcode, suburb, beds, baths, pages) {
  # prepare config for the URL scraping
  # TODO: make this work with carspaces too
  prop_config <- c(beds, baths)

  # seq_len() below needs a valid count; 1:pages would yield c(1, 0) for 0
  n_pages <- suppressWarnings(as.integer(pages))
  if (length(n_pages) != 1 || is.na(n_pages) || n_pages < 1) {
    stop("`pages` must be a single whole number >= 1", call. = FALSE)
  }

  # Create a list of URLs to scrape through
  urls <- paste0(
    'https://www.auhouseprices.com/rent/list/',
    state,
    "/",
    postcode,
    "/",
    suburb,
    "/",
    seq_len(n_pages),
    '/?sort=date&type=apartment&bmin=',
    prop_config[1],
    '&bmax=',
    prop_config[1]
  )

  # Collect one table per page in a local list. The previous
  # exists('rent_all') check also saw a `rent_all` in the global environment
  # and would append the scraped rows to that stale object.
  page_results <- vector("list", length(urls))

  for (i in seq_along(urls)) {
    message(paste0('getting ', i))
    temp <- read_html(urls[[i]])

    # sleep between requests for 2 seconds so as not to bombard the server
    message('sleeping')
    Sys.sleep(2)

    headings <- html_text(html_nodes(temp, 'h4'))
    address <- headings[which(headings != ' Search Filter and Sorting ')]

    # both the rent line and the bed/bath/car counts come from <li> text
    list_items <- html_text(html_nodes(temp, 'li'))

    price_month <- str_extract(list_items, '^Rent.+/week.*\\d{4}$')
    price_month <- price_month[!is.na(price_month)]

    config <- str_extract(list_items, ' \\d \\d \\d*[ ]*$')
    config <- config[!is.na(config)]

    page_results[[i]] <- data.table(address, price_month, config)
  }

  rent_all <- rbindlist(page_results)

  # extract month
  rent_all$month <- str_extract(rent_all$price_month, '[A-Z][a-z]{2} \\d{4}$')
  rent_all$month <- dmy(paste0('01 ', rent_all$month))

  # extract price; drop thousands separators so "1,200" parses as 1200,
  # anything that still is not a plain number becomes NA
  price_text <- str_extract(rent_all$price_month, '(?<=Rent \\$).*(?=/week)')
  price_text <- gsub(",", "", price_text, fixed = TRUE)
  is_plain_number <- grepl("^\\d*\\.?\\d+$", price_text)
  rent_all$price <- as.numeric(ifelse(is_plain_number, price_text, NA))

  # remove any dups
  rent_all <- rent_all[!duplicated(rent_all)]

  # subset to view only those matching property configuration specified above.
  # Anchored at the start of config so that "<beds> <baths>" cannot match the
  # trailing baths/cars pair (e.g. " 2 2 1" for beds = 2, baths = 1).
  pattern <- paste0('^\\s*', prop_config[[1]], '\\s', prop_config[[2]], '(\\s|$)')

  # create our analytical dataset
  ads <- rent_all[grepl(pattern, rent_all$config), ]

  ads
}
```


```{r}
# Scrape apartment rental listings from auhouseprices.com for one suburb.
#
# Arguments:
#   state, postcode, suburb  Path components of the listing URL.
#   beds   Bedroom count; sent as both `bmin` and `bmax` in the query string
#          and used to filter the scraped rows.
#   baths  Bathroom count; only used to filter the scraped rows.
#   pages  Number of result pages to request (a single whole number >= 1).
#
# Returns a data.table with columns address, price_month, config, month
# (Date, first day of the month) and price (numeric, NA when the text is not
# a plain number), de-duplicated and restricted to rows whose config starts
# with "<beds> <baths>".
rent_scrape <- function(state, postcode, suburb, beds, baths, pages) {
  # prepare config for the URL scraping
  # TODO: make this work with carspaces too
  prop_config <- c(beds, baths)

  # seq_len() below needs a valid count; 1:pages would yield c(1, 0) for 0
  n_pages <- suppressWarnings(as.integer(pages))
  if (length(n_pages) != 1 || is.na(n_pages) || n_pages < 1) {
    stop("`pages` must be a single whole number >= 1", call. = FALSE)
  }

  # Create a list of URLs to scrape through
  urls <- paste0(
    'https://www.auhouseprices.com/rent/list/',
    state, "/",
    postcode, "/",
    suburb, "/",
    seq_len(n_pages),
    '/?sort=date&type=apartment&bmin=',
    prop_config[1],
    '&bmax=',
    prop_config[1]
  )

  # Fetch one results page and return its listings as a data.table with
  # columns address, price_month and config.
  scrape_page <- function(url) {
    message(paste0('getting ', url))
    # pause before every request so as not to bombard the server
    Sys.sleep(2)
    page <- read_html(url)

    headings <- html_text(html_nodes(page, 'h4'))
    address <- headings[which(headings != ' Search Filter and Sorting ')]

    # both the rent line and the bed/bath/car counts come from <li> text,
    # so read the nodes once
    list_items <- html_text(html_nodes(page, 'li'))

    price_month <- str_extract(list_items, '^Rent.+/week.*\\d{4}$')
    price_month <- price_month[!is.na(price_month)]

    config <- str_extract(list_items, ' \\d \\d \\d*[ ]*$')
    config <- config[!is.na(config)]

    data.table(address, price_month, config)
  }

  # Scrape each URL and combine the results into a single data table
  rent_all <- rbindlist(lapply(urls, scrape_page))

  # extract month
  rent_all$month <- str_extract(rent_all$price_month, '[A-Z][a-z]{2} \\d{4}$')
  rent_all$month <- dmy(paste0('01 ', rent_all$month))

  # extract price; drop thousands separators so "1,200" parses as 1200,
  # anything that still is not a plain number becomes NA
  price <- str_extract(rent_all$price_month, '(?<=Rent \\$).*(?=/week)')
  price <- gsub(",", "", price, fixed = TRUE)
  rent_all$price <- as.numeric(ifelse(grepl("^\\d*\\.?\\d+$", price), price, NA))

  # remove any dups
  rent_all <- rent_all[!duplicated(rent_all)]

  # subset to view only those matching property configuration specified above.
  # Anchored at the start of config so that "<beds> <baths>" cannot match the
  # trailing baths/cars pair (e.g. " 2 2 1" for beds = 2, baths = 1).
  pattern <- paste0('^\\s*', prop_config[[1]], '\\s', prop_config[[2]], '(\\s|$)')

  # create our analytical dataset
  ads <- rent_all[grepl(pattern, rent_all$config), ]

  ads
}
```


```{r}
# Scrape up to 100 result pages of 1 bed / 1 bath listings for Mosman.
rent_data <- rent_scrape(
  state = "NSW",
  postcode = "2088",
  suburb = "Mosman",
  beds = 1,
  baths = 1,
  pages = 100
)
```

```{r}
# Print the scraped listings as a data.table.
data.table(rent_data)
```


```{r}
# Scrape every page in `urls` and stack the listings into `rent_all`.
# Results are collected in a list and bound once at the end, so `rent_all`
# is always rebuilt from scratch and no exists()/rm() bookkeeping is needed.
page_results <- vector("list", length(urls))

# seq_along() is safe when `urls` is empty (1:length(urls) would give c(1, 0))
for (i in seq_along(urls)) {
  curr_url <- urls[[i]]
  message(paste0('getting ', i))
  temp <- read_html(curr_url)

  # sleep between requests for 2 seconds so as not to bombard the server
  message('sleeping')
  Sys.sleep(2)

  address <- temp %>%
    html_nodes('h4') %>%
    html_text() %>%
    .[which(. != ' Search Filter and Sorting ')]

  # both the rent line and the bed/bath/car counts come from <li> text
  list_items <- temp %>%
    html_nodes('li') %>%
    html_text()

  price_month <- str_extract(list_items, '^Rent.+/week.*\\d{4}$')
  price_month <- price_month[!is.na(price_month)]

  config <- str_extract(list_items, ' \\d \\d \\d*[ ]*$')
  config <- config[!is.na(config)]

  page_results[[i]] <- data.table(address, price_month, config)
}

# master data set: one row per scraped listing
rent_all <- rbindlist(page_results)
```


```{r}
# Derive month and price columns from the scraped text, de-duplicate, and
# keep only listings matching `prop_config` (beds, baths) in `ads`.

# extract month (trailing "Mon YYYY"), stored as the first day of that month
rent_all$month <- str_extract(rent_all$price_month, '[A-Z][a-z]{2} \\d{4}$')
rent_all$month <- dmy(paste0('01 ', rent_all$month))

# extract price; drop thousands separators so "1,200" parses as 1200,
# anything that still is not a plain number becomes NA
price_text <- str_extract(rent_all$price_month, '(?<=Rent \\$).*(?=/week)')
price_text <- gsub(",", "", price_text, fixed = TRUE)
rent_all$price <- as.numeric(
  ifelse(grepl("^\\d*\\.?\\d+$", price_text), price_text, NA)
)

# remove any dups
rent_all <- rent_all[!duplicated(rent_all)]

# subset to view only those matching property configuration specified above.
# Anchored at the start of config so that "<beds> <baths>" cannot match the
# trailing baths/cars pair (e.g. " 2 2 1" for beds = 2, baths = 1).
pattern <- paste0('^\\s*', prop_config[[1]], '\\s', prop_config[[2]], '(\\s|$)')

# create our analytical dataset
ads <- rent_all[grepl(pattern, rent_all$config), ]
```


```{r}
library(plotly)

# Boxplot of weekly rent per month, rendered interactively with plotly.

# Rows with any missing value are dropped before plotting.
complete_rows <- drop_na(rent_data)

# Subtitle reflects the months actually present in the data rather than a
# fixed date range.
month_range <- format(range(complete_rows$month), '%B %Y')

p <- complete_rows %>%
  # label months as "Mon YYYY" but keep them in chronological order
  mutate(month = reorder(
    factor(format(month, '%b %Y')), as.numeric(month)
  )) %>%
  ggplot(aes(x = month, y = price)) +
  geom_boxplot() +
  # reference line at $500 per week
  geom_hline(yintercept = 500, color = "green", linetype = "dashed") +
  geom_jitter(alpha = 0.2) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  labs(
    x = 'Month rented',
    y = 'Weekly rent',
    title = paste0('Distribution of weekly rent in ', suburb),
    subtitle = paste(month_range[1], '-', month_range[2])
  )

ggplotly(p)
```


```{r}
# Histogram of weekly rent with a dashed reference line at $500,
# rendered interactively with plotly.
p <- ggplot(rent_data, aes(x = price)) +
  geom_histogram() +
  geom_vline(xintercept = 500, linetype = "dashed", color = "dark green") +
  labs(
    x = "Weekly Rent",
    y = "Number of Properties"
  ) +
  theme_bw() +
  theme(panel.grid = element_blank())

ggplotly(p)
```


```{r}
library(highcharter)

# Interactive boxplot of weekly rent grouped by month (outliers hidden).
hcboxplot(
  x = rent_data$price,
  var = rent_data$month,
  name = "Weekly rent",
  color = "#2980b9",
  outliers = FALSE
) %>%
  hc_chart(type = "column") %>%
  hc_title(text = "Weekly rent in 2088 Mosman, NSW: 1 bed 1 bath units") %>%
  # the y-axis carries rent_data$price, i.e. weekly rent
  hc_yAxis(title = list(text = "Weekly rent ($)")) %>%
  hc_plotOptions(scatter = list(jitter = list(x = .1, y = 0)))
```


```{r}
# Weekly rent over time: jittered listings with a smoothed trend line.
ggplot(rent_data, aes(x = month, y = price)) +
  geom_jitter() +
  geom_smooth() +
  theme_bw() +
  theme(panel.grid = element_blank())
```


```{r}
# what is the suburb median? (ignore listings whose price is NA)
median(rent_data$price, na.rm = TRUE)

# smooth data using rolling quarterly median
# Listings without a price or month are dropped first so that every monthly
# median is a real number; the rolling median cannot work through NAs.
monthly_medians <- rent_data %>%
  dplyr::filter(!is.na(price), !is.na(month)) %>%
  group_by(month) %>%
  summarise(
    median_price = median(price),
    min_price = min(price),
    max_price = max(price)
  )

# Right-aligned window of 3 months; the first two months have no full window
# and are padded with NA (`fill = NA` replaces the deprecated `na.pad`).
n_months <- nrow(monthly_medians)
rolled <- if (n_months >= 3) {
  rollmedian(monthly_medians$median_price, 3, fill = NA, align = 'right')
} else {
  # not enough months for a single window
  rep(NA_real_, n_months)
}

smoothed <- data.table(
  month = as.Date(monthly_medians$month),
  rol_median = rolled
)

# keep only months that have a smoothed value
rol_median <- smoothed[!is.na(smoothed$rol_median), ]
```

```{r}
# Bar chart of the smoothed (rolling quarterly median) weekly rent per month,
# with the visible y-range limited to 400-600.
ggplot(rol_median, aes(x = month, y = rol_median)) +
  geom_bar(stat = 'identity', fill = "#8A1A6C") +
  scale_x_date(date_labels = "%b %Y", date_breaks = '1 month') +
  coord_cartesian(ylim = c(400, 600)) +
  labs(
    x = 'Month rented',
    y = 'Smoothed weekly rent',
    title = paste0('Distribution of weekly rent in ', suburb),
    subtitle = 'Smoothed by rolling quarterly median'
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5),
    panel.grid = element_blank()
  )
```


```{r}
# take a closer look at an address of interest. I used it to look at my apartment building

# Text (regular expression) identifying the building within `address`.
# The empty pattern matches every listing - fill it in.
address_pattern <- ""

building <- ads %>%
  dplyr::filter(grepl(address_pattern, address, ignore.case = TRUE))

# extract the year from the date column
building$date <- as.Date(building$month, format = "%Y-%m-%d")
building$year <- year(building$date)

# Extract the unit and unit number from the address column:
# `unit` is the "<unit>/<street number>" part, `unit_number` its leading digits
building$unit <- str_extract(building$address, "\\d+/\\d+")
building$unit_number <- str_extract(building$unit, "\\d+")

# what is the median unit price pw at my address (ignoring missing prices)
addressMedian <- median(building$price, na.rm = TRUE)
```