library(tidyverse)
library(dplyr)     # Data manipulation
library(ggplot2)   # Visualization
library(caret)     # Confusion matrix
library(pscl)    # For pseudo R-squared measures
library(stargazer) # Model summary tables
library(knitr) 
library(readr)
library(xgboost)
library(randomForest)
library(GPfit)
library(tidymodels)
library(caret)

#####
# Failed attempt at parsing data # 1
#dataset1 <- read_csv("Data/snap_mm2w1ese14tkry9w25.1.csv")
#dataset2 <- read_csv("Data/snap_mm2w1ese14tkry9w25.2.csv")
#dataset3 <- read_csv("Data/snap_mm2w1ese14tkry9w25.3.csv")



#dataset1 <- read_csv(
#  "Data/snap_mm2w1ese14tkry9w25.1.csv",
#  col_types = cols(.default = col_character())
#)

#dataset2 <- read_csv(
#  "Data/snap_mm2w1ese14tkry9w25.2.csv",
#  col_types = cols(.default = col_character())
#)

#dataset3 <- read_csv(
#  "Data/snap_mm2w1ese14tkry9w25.3.csv",
#  col_types = cols(.default = col_character())
#)

#colnames(dataG)[c(81,96,120)]

#Nuclear Option: Turn to all chars
#dataset1 <- read_csv("Data/snap_mm2w1ese14tkry9w25.1.csv",
#                     col_types = cols(.default = col_character()))

#full_data <- bind_rows(dataset1, dataset2, dataset3)


#####
# Failed attempt number 2
#ALL ONE GO
#files <- list.files("Data", pattern = "snap_mm2w1ese14tkry9w25", full.names = TRUE)

library(purrr)
library(tidyverse)
library(dplyr)
library(readr)
library(lubridate)
library(data.table)

#data_fread1 <- fread("Data/snap_mm2w1ese14tkry9w25.1.csv")
#data_fread2 <- fread("Data/snap_mm2w1ese14tkry9w25.2.csv")
#data_fread3 <- fread("Data/snap_mm2w1ese14tkry9w25.3.csv")

#####
# Real Parsing method
# If you want to run this I would HIGHLY recommend 
# Dropping description ASAP
library(data.table)
library(purrr)
library(tidyverse)
library(dplyr)
library(readr)
library(lubridate)


# Get file list automatically
# NOTE(review): picks up EVERY file in Data/ — assumes only the snapshot
# CSVs live there; confirm no stray files (e.g. previous outputs) exist.
files <- list.files("Data", full.names = TRUE)

# Read + bind in one step (fastest way)
# fill = TRUE pads columns missing from any one file with NA;
# use.names = TRUE matches columns by name rather than position.
DT <- rbindlist(
  lapply(files, fread),
  use.names = TRUE,
  fill = TRUE
)

# Remove unwanted columns EARLY 
# `:= NULL` deletes the columns in place (no copy of the big table).
DT[, c(
  "isCurrentSignedInUserVerifiedOwner",
  "isVerifiedClaimedByCurrentSignedInUser",
  "hasBadGeocode",
  "streetViewMetadataUrlMediaWallLatLong",
  "streetViewMetadataUrlMediaWallAddress",
  "streetViewServiceUrl",
  "listingDataSource",
  "isUndisclosedAddress",
  "photoCount",
  "has_3d_tour",
  "isPremierBuilder",
  "isCurrentSignedInAgentResponsible",
  "isListingClaimedBySignedInUser",
  "isZillowOwned",
  "hdpUrl",
  "tourViewCount",
  "hasPublicVideo",
  "hasApprovedThirdPartyVirtualTourUrl",
  "streetViewTileImageUrlMediumLatLong",
  "isNonOwnerOccupied"
) := NULL]

# Convert types by REFERENCE 
# Malformed strings become NA (with coercion warnings) — acceptable here.

DT[, `:=`(
  bathrooms = as.integer(bathrooms),
  bedrooms = as.integer(bedrooms),
  yearBuilt = as.integer(yearBuilt),
  taxAssessedYear = as.integer(taxAssessedYear),
  daysOnZillow = as.integer(daysOnZillow),
  zpid = as.numeric(zpid),  # safer than integer (zpid can exceed 32-bit range)
  longitude = as.numeric(longitude),
  latitude = as.numeric(latitude),
  lotAreaValue = as.numeric(lotAreaValue),
  propertyTaxRate = as.numeric(propertyTaxRate),
  sold_to_list_ratio = as.numeric(sold_to_list_ratio),
  sold_to_zestimate_ratio = as.numeric(sold_to_zestimate_ratio)
)]

# Fast numeric parsing (remove $ and commas without readr)
# NOTE: the regex strips EVERY non-digit/non-dot character, including any
# minus sign — fine for prices, which are never negative here.
cols_money <- c("price", "zestimate", "rentZestimate", "taxAssessedValue")

DT[, (cols_money) := lapply(.SD, function(x)
  as.numeric(gsub("[^0-9.]", "", x))
), .SDcols = cols_money]

# Logic conversion
# NOTE(review): matches lowercase "true" only — any "TRUE"/"True" variants
# in the source data would silently become FALSE; confirm the export format.
logical_cols <- c("isFeatured", "isOffMarket", "is_flipped_within_12_months")

DT[, (logical_cols) := lapply(.SD, function(x) x == "true"),
   .SDcols = logical_cols]

# Date conversion — timestamps assumed to already be UTC wall-clock strings
DT[, `:=`(
  dateSold = as.POSIXct(dateSold, format="%Y-%m-%d %H:%M:%S", tz="UTC"),
  dateSoldString = as.Date(dateSoldString),
  timestamp = as.POSIXct(timestamp, format="%Y-%m-%d %H:%M:%S", tz="UTC")
)]

# Write cleaned file (checkpoint so the slow ingest never has to re-run)
fwrite(DT, "Clean_Data.csv")

# Turn back into a nice R table for the tidyverse steps that follow
data <- as_tibble(DT)

glimpse(data)

# Further cleaning: first pass of column removal.
# These are portal/listing metadata fields with no pricing signal.
# Held in a character vector so the two `resoFacts:` names (which contain
# a colon) need no backticks.
drop_pass_one <- c(
  "sold_to_list_ratio", "num_of_applications", "num_of_contacts",
  "payment_breakdown", "publish_url", "tags", "originating_mls",
  "special_offer", "mls_id", "availability_date", "open_house_details",
  "unit_number", "unit_amenities", "contingent_listing_type",
  "management_company_phone_number", "is_listed_by_management_company",
  "citySearchUrl", "isOffMarket", "url", "isRentalsLeadCapMet",
  "is_showcased", "isFeatured", "listingTypeDimension",
  "resoFacts:sewer", "resoFacts:waterSource", "photos",
  "virtualTourUrl", "tourEligibility", "hdpTypeDimension",
  "isRentalListingOffMarket", "totalCount",
  "zestimateMinus30", "restimateMinus30",
  "restimateHighPercent", "zestimateHighPercent",
  "zestimateLowPercent", "restimateLowPercent",
  "isListingClaimedByCurrentSignedInUser"
)

data_clean <- data |>
  select(-all_of(drop_pass_one))

# Checkpoint after the first reduction
fwrite(data_clean, "Cleaned_Reduced_Data.csv")

# Second pass: drop nested/history blobs and assorted metadata that the
# model will not use.
drop_pass_two <- c(
  "livingAreaUnits", "parcelId", "taxHistory", "priceHistory",
  "nearbyHomes", "isInstantOfferEnabled", "zillowOfferMarket",
  "nearbyZipcodes", "rentalApplicationsAcceptedType", "brokerageName",
  "climate_risks", "selfTour", "timeZone"
)

data_clean <- data_clean |>
  select(-all_of(drop_pass_two))

# Pull the free-text description out on its own BEFORE dropping it — it is
# by far the widest column ("the BIG one") and dominates memory use.
description <- data_clean |>
  select(description)

# Third pass: remaining columns with no modeling value.
drop_pass_three <- c(
  "currency", "sqft", "hideZestimate", "dateSoldString",
  "isHousingConnector", "countyFIPS", "homeValuation",
  "postingContact", "description", "financial",
  "zestimate_history", "country", "homeType"
)

data_clean <- data_clean |>
  select(-all_of(drop_pass_three))

# Checkpoint: the reduced working dataset
fwrite(data_clean, "Data.1.0.0.csv")

# Eyeball the result
view(data_clean)

#####
#EDA
library(readr)
library(tidyverse)
library(dplyr)
library(fable)
library(tsibble)
library(scales)
# Reload the checkpoint written above.
# NOTE(review): machine-specific absolute path — adjust to your own clone
# of the project before running.
Data_1_0_0 <- read_csv("~/Documents/DATA400/Big Set Manipulation/Data.1.0.0.csv")

View(Data_1_0_0)


#Visuals
# Short alias used throughout the EDA section below
df <- Data_1_0_0

#Price By Bedroom Visual
# Average sale price per bedroom count, capped at 12 bedrooms to drop
# mis-entered listings.
df |> 
  filter(bedrooms <= 12) |> 
  group_by(bedrooms) |> 
  summarise(avg_price = mean(price, na.rm = TRUE)) |> 
  ggplot(aes(x = bedrooms, y = avg_price)) +
  geom_line(color = "midnightblue") +
  # FIX: na.rm = TRUE added — if bedrooms contains any NA (likely in raw
  # scrape data), min/max return NA and seq(NA, NA) aborts the plot.
  scale_x_continuous(breaks = seq(min(df$bedrooms, na.rm = TRUE),
                                  max(df$bedrooms, na.rm = TRUE), by = 1)) +
  scale_y_continuous(breaks = breaks_width(100000), labels = label_comma()) +
  labs(title = "Average Price By Bed", x = "Bed", y = "Average Price") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.50),
        plot.subtitle = element_text(hjust = 0.50),
        legend.position = 'bottom')

#Price By Bathroom Visual
# Average sale price per bathroom count, capped at 10 bathrooms.
df |> 
  filter(bathrooms <= 10) |> 
  group_by(bathrooms) |> 
  summarise(avg_price = mean(price, na.rm = TRUE)) |> 
  ggplot(aes(x = bathrooms, y = avg_price)) +
  geom_line(color = "midnightblue") +
  # na.rm = TRUE guards against NA bathrooms making seq(NA, NA) error
  scale_x_continuous(breaks = seq(min(df$bathrooms, na.rm = TRUE),
                                  max(df$bathrooms, na.rm = TRUE), by = 1)) +
  # FIX: dropped the stray positional `max(2000000)` — it was matched to
  # the scale's first argument (`name`) and retitled the y axis "2e+06".
  # If a 2M cap was intended, add `limits = c(NA, 2e6)` explicitly.
  scale_y_continuous(breaks = breaks_width(100000), labels = label_comma()) +
  labs(title = "Average Price By Bath", x = "Bath", y = "Average Price") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.50),
        plot.subtitle = element_text(hjust = 0.50),
        legend.position = 'bottom')

#Top 10 cities for mansions
# "Mansion" here = over 5,000 sqft of living area; restricted to rows whose
# lot units are square feet so the livingArea threshold is comparable.
Data_1_0_0 |> 
  filter(lotAreaUnits == "Square Feet",livingArea > 5000) |> 
  group_by(city) |> 
  count() |> 
  arrange(desc(n)) |> 
  print(n = 10)
  
  
#Bed Count
# Frequency of each bedroom count, most common first
Data_1_0_0 |> 
  group_by(bedrooms) |> 
  count() |> 
  arrange(desc(n)) |> 
  print(n = 10)

#Bed avg_Price
# Bucketed average price; case_when falls through in order, so each row
# lands in the first matching range. NA bedrooms get an NA bucket.
Data_1_0_0 |>
  mutate(bedroom_range = case_when(
      bedrooms <= 2 ~ "1–2",
      bedrooms <= 4 ~ "3–4",
      bedrooms <= 6 ~ "5–6",
      bedrooms >= 7 ~ "7+"
    )) |>
  group_by(bedroom_range) |>
  summarise(avg_price = mean(price, na.rm = TRUE))

#Bath Count
# Frequency of each bathroom count, most common first
Data_1_0_0 |> 
  group_by(bathrooms) |> 
  count() |> 
  arrange(desc(n)) |> 
  print(n = 10)

#Price Range Count (GArbage — 50k bins over the full range are far too
# granular to read, kept for reference)
# FIX: na.rm = TRUE added — max(price) is NA when any price is missing,
# which makes seq(), and therefore cut(), abort.
# Note: prices above the last full 50k bin (and NAs) land in an NA bucket.
Data_1_0_0 |>
  mutate(price_range = cut(price, breaks = seq(0, max(price, na.rm = TRUE),
                                               by = 50000))) |>
  count(price_range)

# Housing prices are heavily right-skewed: a long upper tail of luxury
# sales dominates the distribution.
price_hist <- ggplot(df, aes(x = price)) +
  geom_histogram(binwidth = 50000, fill = "midnightblue", color = "black") +
  labs(title = "Distribution of Housing Prices",
       x = "Price in USD",
       y = "Count") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.50),
        plot.subtitle = element_text(hjust = 0.50),
        legend.position = 'bottom')
print(price_hist)

# Price vs living area, restricted to square-foot lots under 5,000 sqft
# of living area, with an OLS trend line.
sqft_scatter <- df |>
  filter(lotAreaUnits == "Square Feet", livingArea < 5000) |>
  ggplot(aes(x = livingArea, y = price)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "Price vs. Square Footage",
       x = "Square Footage",
       y = "Price in USD") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.50),
        plot.subtitle = element_text(hjust = 0.50),
        legend.position = 'bottom')
print(sqft_scatter)

# Scatter plot of price by build year (1900 onward), with OLS trend line
df |> 
  filter(yearBuilt >= 1900) |> 
  ggplot(aes(x = yearBuilt, y = price)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "red") +
  # FIX: `breaks = 25` placed a single break at the VALUE 25 (off-axis),
  # leaving the x axis unlabelled; breaks_width(25) (scales pkg, already
  # loaded) ticks every 25 years instead.
  scale_x_continuous(breaks = breaks_width(25)) +
  labs(title = "Price vs. Year Built",
       x = "Year Built",
       y = "Price in USD") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.50),
        plot.subtitle = element_text(hjust = 0.50),
        axis.text.x = element_text(hjust = 1, angle = 45),
        legend.position = 'bottom')

glimpse(df)

# Derive a build date, the house's age in years, and a proper Date for
# dateSold, all in a single mutate pass.
df <- df |>
  mutate(
    yearBuiltDate = as.Date(paste0(yearBuilt, "-01-01")),
    houseAge = as.numeric(format(Sys.Date(), "%Y")) - yearBuilt,
    dateSold = ymd(dateSold)
  )
#####
# Coorelation matrix to try to find 'best' predictor
library(ggplot2)
library(reshape2)  
library(corrplot) 
# Correlation of price against the main numeric candidate predictors.
num_vars <- df |> 
  select(price, livingArea, yearBuilt, bedrooms, bathrooms)

# "complete.obs" = listwise deletion: rows with any NA are dropped before
# computing the matrix.
cor_matrix <- num_vars |>
  cor(use = "complete.obs")

print(cor_matrix)

# Heatmap with coefficients overlaid; hclust ordering groups related vars
corrplot(cor_matrix, method = "color", addCoef.col = "black",
         number.cex = 0.8, tl.cex = 0.8, tl.col = "black",
         order = "hclust")
#####
# FURTHER CLEANING
# Frequency tables used to spot data-entry outliers before filtering.
df |> 
  select(houseAge) |> 
  group_by(houseAge) |> 
  count() |> 
  arrange(desc(n))

df |> 
  select(yearBuilt) |> 
  group_by(yearBuilt) |> 
  count() |> 
  arrange(desc(n))

#Summary Stats
# NOTE(review): no na.rm = TRUE here — these print NA if the column has
# any missing values; add na.rm if that happens.
df |> 
  summarise(mean(price))

df |> 
  summarise(mean(bedrooms))

df |> 
  summarise(mean(bathrooms))

#Weird Break Here FIX
df |> 
  summarise(mean(livingArea))

# Distribution of the raw utilities JSON strings
df |> 
  select(utilities) |> 
  group_by(utilities) |> 
  count() |> 
  arrange(desc(n))

# Listings per county
df |> 
  group_by(county) |> 
  count() |> 
  arrange(desc(n))

# Drop remaining descriptive columns.
# FIX: `financial`, `zestimate_history`, `country` and `homeType` were
# already removed before Data.1.0.0.csv was written (third select pass
# above), so a bare `-financial` here aborts with "column doesn't exist".
# any_of() drops each name only when it is actually present.
df <- df |> 
  select(-any_of(c("financial",
                   "zestimate_history",
                   "country",
                   "homeType",
                   "propertyTypeDimension",
                   "abbreviatedAddress")))

# Sales per close date, most common first
df |> 
  group_by(dateSold) |> 
  count() |> 
  arrange(desc(n))


# Save Point
# FIX: the file was previously written twice back-to-back (write_csv then
# fwrite to the same path) — the write_csv output was immediately
# overwritten, so only the (faster) fwrite is kept.
fwrite(df, "Data.1.0.1.csv")

#####
# Parsing out the JSON Files first attempts
library(jsonlite)
library(purrr)

# Raw getting_around column: JSON with CSV-doubled quotes ("" for ")
df$getting_around
# NOTE: despite the name, df_clean is a character VECTOR of repaired JSON
# strings, not a data frame.
df_clean <- gsub('""', '"', df$getting_around)

# NOTE(review): this was the "first attempt" — fromJSON aborts on the
# first NA/empty entry, which is why safe_parse() is introduced below.
parsed <- map(df_clean, fromJSON)

# NOTE: double write to the same path — the fwrite output wins.
write_csv(df, "Data.1.0.2.csv")
fwrite(df, "Data.1.0.2.csv")

# Round-trip through CSV (base read.csv returns a plain data.frame)
df <- read.csv("Data.1.0.2.csv")

# overview, listing_provided_by
df <- df |> 
  select(-listing_provided_by)

#####
#1
# Parse one JSON string, returning NULL for anything unparseable.
#
# @param x A single raw JSON string (possibly with CSV-doubled quotes),
#   or NA/NULL/"" for missing data.
# @return The parsed R object, or NULL when x is missing or invalid.
safe_parse <- function(x) {
  # FIX: guard NULL/zero-length first — is.na(NULL) is logical(0), which
  # made the original `if (is.na(x) || ...)` abort with a zero-length
  # condition. Anything that is not exactly one value counts as "no data".
  if (is.null(x) || length(x) != 1) return(NULL)
  if (is.na(x) || x == "") return(NULL)
  # Collapse CSV-style doubled quotes back to plain quotes
  x <- gsub('""', '"', x)
  tryCatch(fromJSON(x), error = function(e) NULL)
}

# Quick look at the raw JSON column before extracting from it
df |> 
  select(getting_around)


# NULL-safe extractor for one mode's score out of a parsed blob.
# Returns NA_real_ when the blob or the score is absent.
pluck_score <- function(p, mode) {
  val <- p[[mode]][["score"]]
  if (is.null(val)) NA_real_ else as.numeric(val)
}

parsed <- lapply(df_clean, safe_parse)
# FIX: sapply() silently returned a LIST (not a numeric vector) as soon
# as any parsed entry lacked a score, corrupting the new columns; vapply
# with a NULL-safe extractor guarantees numeric vectors with NA gaps.
df$bike_score    <- vapply(parsed, pluck_score, numeric(1), mode = "bike")
df$walk_score    <- vapply(parsed, pluck_score, numeric(1), mode = "walk")
df$transit_score <- vapply(parsed, pluck_score, numeric(1), mode = "transit")


#####
# Further checking of VARS
# Use to check understanding of edge cases of certain variables
df |> 
  select(livingAreaUnitsShort) |> 
  #filter(is.na(interior)) |> 
  group_by(livingAreaUnitsShort) |> 
  count() |> 
  arrange(desc(n))
#####
#Drop more useless columns: financial_listing_details, base_rent
# NOTE(review): bare -name errors if either column is already gone —
# switch to -any_of() if this section is re-run out of order.
df <- df |> 
  select(-financial_listing_details, -base_rent)
#####

# Mistake fixer (Fix to your file_path)
# Might get a little rocky from here on
# Keeps an untouched copy of the 1.0.1 checkpoint for recovery.
df_untouch <- read_csv("~/Documents/DATA400/Big Set Manipulation/Data.1.0.1.csv")

#####
# Proper JSON work

library(tidyverse)
library(readr)
library(dplyr)
library(jsonlite)
library(purrr)
library(tidyr)
library(stringr)

# Parse one JSON string, returning NULL for anything unparseable.
# (Redefinition of safe_parse above, kept so the script can be re-run
# from this section onward.)
#
# @param x A single raw JSON string, or NA/NULL/"" for missing data.
# @return The parsed R object, or NULL when x is missing or invalid.
safe_parse <- function(x) {
  # FIX: a zero-length or multi-element x made is.na(x)/x == "" yield a
  # non-scalar condition and abort the `||` chain; anything that is not
  # exactly one value counts as "no data".
  if (is.null(x) || length(x) != 1) return(NULL)
  if (is.na(x) || x == "") return(NULL)
  x <- gsub('""', '"', x)
  tryCatch(fromJSON(x), error = function(e) NULL)
}

# More Robust Version
# Accepts already-parsed lists and non-character junk without erroring.
#
# @param x One cell of a JSON column: a string, an already-parsed list,
#   NA, NULL, or empty.
# @return The parsed R object; the input itself when already a list;
#   otherwise NULL.
safer_parse <- function(x) {
  # Handle empty / missing
  if (length(x) == 0 || is.null(x)) {
    return(NULL)
  }
  
  # If already parsed, return as-is.
  # FIX: this check must come BEFORE any is.na() — is.na(list) is
  # vectorised, so a multi-element list made the original `||` chain
  # abort instead of passing the list through.
  if (is.list(x)) return(x)
  
  # Ensure a single character scalar (multi-element vectors previously
  # triggered the same non-scalar-condition error)
  if (!is.character(x) || length(x) != 1) return(NULL)
  
  if (is.na(x) || identical(x, "")) return(NULL)
  
  # Fix escaped quotes safely
  x <- gsub('""', '"', x, fixed = TRUE)
  
  # Try parsing; any malformed JSON becomes NULL
  tryCatch(
    jsonlite::fromJSON(x),
    error = function(e) NULL
  )
}



# Columns that hold JSON blobs to be parsed into list-columns.
json_cols <- c(
  "schools", "nearbyCities", "utilities",
  "getting_around", "interior", "property",
  "construction", "community_details",
  "listing_provided_by", "hoa_details", "getting_around_scores"
)

# any_of() silently skips names not present in df (e.g.
# listing_provided_by, which was dropped earlier).
df_parsed <- df |> 
  mutate(across(any_of(json_cols), ~ map(., safer_parse)))

# Spot-check the first parsed entry
df_parsed$getting_around_scores[[1]]

#Skip SCHOOL For now
#####
# Skip school if using SQL server to section out schools
#Schools
# NOTE(review): assumes each parsed schools entry is a data frame with a
# `distance` column — nrow() on a bare list returns NULL and
# `nrow(.x) == 0` would then abort the mutate; confirm the parse shape.
df_parsed <- df_parsed |>
  mutate(
    min_school_distance = map_dbl(schools, ~ {
      if (is.null(.x) || nrow(.x) == 0 || all(is.na(.x$distance))) return(NA_real_)
      min(.x$distance, na.rm = TRUE)
    }),
    
    num_schools = map_int(schools, ~ {
      if (is.null(.x) || nrow(.x) == 0) return(0)
      nrow(.x)
    })
  )
#####
# Parsing out the important stuff
#Cities
# Count of nearby cities and the first-listed one.
df_parsed <- df_parsed |>
  mutate(
    num_nearby_cities = map_int(nearbyCities, ~ {
      if (is.null(.x)) return(0L)
      # FIX: NROW() (not nrow()) — parses that came back as plain
      # vectors/lists make nrow() return NULL, and `NULL == 0` aborted
      # the whole mutate; NROW() handles both shapes.
      NROW(.x)
    }),
    
    closest_city = map_chr(nearbyCities, ~ {
      if (is.null(.x) || NROW(.x) == 0) return(NA_character_)
      # Assumes entries arrive sorted nearest-first — TODO confirm
      .x$name[1] 
    })
  )

# Travel
# Each *_score field is free text like "75 / 100 Very Walkable"; pull the
# first run of digits out of it.
# One shared helper replaces three copy-pasted anonymous functions.
#
# @param entry One parsed getting_around_scores blob (list or NULL).
# @param field Which score field to read ("walk_score", "bike_score",
#   "transit_score").
# @return The score as a double, or NA_real_ when absent/unparseable.
extract_gas_score <- function(entry, field) {
  val <- entry[[field]]
  if (is.null(val)) return(NA_real_)
  
  num <- str_extract(val, "\\d+")
  if (is.na(num)) return(NA_real_)
  
  as.numeric(num)
}

df_parsed <- df_parsed |>
  mutate(
    walk_score    = map_dbl(getting_around_scores, extract_gas_score,
                            field = "walk_score"),
    bike_score    = map_dbl(getting_around_scores, extract_gas_score,
                            field = "bike_score"),
    transit_score = map_dbl(getting_around_scores, extract_gas_score,
                            field = "transit_score")
  )


#Interior
# Keyword flags scraped out of the parsed interior blobs.
# NOTE(review): `other` and `interior_full` are NOT in json_cols and are
# not created anywhere visible in this script — if they are absent from
# df_parsed this mutate errors; confirm they exist in the scraped data.
df_parsed <- df_parsed |>
  mutate(
    has_fireplace = map_lgl(interior, ~ {
      if (is.null(.x)) return(FALSE)
      any(str_detect(tolower(unlist(.x)), "fireplace"))
    }),
    
    has_basement = map_lgl(other, ~ {
      if (is.null(.x)) return(FALSE)
      any(str_detect(tolower(unlist(.x)), "basement"))
    }),
    
    master_bedroom = map_lgl(interior_full, ~ {
      if (is.null(.x)) return(FALSE)
      any(str_detect(tolower(unlist(.x)), "masterbedroom"))
    })
  )

# Property
# Keyword flags plus a parking-space count from the property blob.
df_parsed <- df_parsed |>
  mutate(
    has_pool = map_lgl(property, ~ {
      if (is.null(.x)) return(FALSE)
      any(str_detect(tolower(unlist(.x)), "pool"))
    }),
    
    has_garage = map_lgl(property, ~ {
      if (is.null(.x)) return(FALSE)
      any(str_detect(tolower(unlist(.x)), "garage"))
    }),
    
    # NOTE(review): grabs the FIRST number found anywhere in the property
    # blob — this can pick up lot size or year instead of parking; verify
    # against a few raw entries.
    parking_spaces = map_dbl(property, ~ {
      if (is.null(.x)) return(NA_real_)
      vals <- unlist(.x)
      num <- str_extract(vals, "\\d+")
      as.numeric(num[!is.na(num)][1])
    })
  )

# Construction: flag the word "new" anywhere in the blob, and keep the
# full flattened text for later inspection.
df_parsed <- df_parsed |>
  mutate(
    has_new_construction = map_lgl(construction, ~ {
      if (is.null(.x)) return(FALSE)
      .x |>
        unlist() |>
        tolower() |>
        str_detect("new") |>
        any()
    }),
    
    construction_type = map_chr(construction, ~ {
      if (is.null(.x)) return(NA_character_)
      paste(unlist(.x), collapse = " ")
    })
  )

# Commun  WHY did I add this (TOO much caffeine and not enough sleep)
# Presence flag only — TRUE whenever the blob parsed to anything at all.
df_parsed <- df_parsed |>
  mutate(
    has_community = map_lgl(community_details, ~ !is.null(.x))
  )

# Hoa
# %||% (rlang/purrr null-default) supplies FALSE when the key is absent.
# NOTE(review): if .x$has_hoa is the STRING "true" rather than a logical,
# map_lgl will error — confirm the parsed type.
df_parsed <- df_parsed |>
  mutate(
    has_hoa = map_lgl(hoa_details, ~ {
      if (is.null(.x)) return(FALSE)
      .x$has_hoa %||% FALSE
    })
  )

# Utilities
# NOTE(review): `resofacts_water_source` / `resofacts_sewer` are not
# created anywhere visible in this script, and the similarly-named
# `resoFacts:waterSource` / `resoFacts:sewer` columns were dropped in the
# first cleaning pass — confirm these snake_case columns really exist.
# Also: str_detect(..., "sewer") matches "no sewer" too.
df_parsed <- df_parsed |> 
  mutate(
    has_public_water = str_detect(
      tolower(coalesce(resofacts_water_source, "")), "public"),
    
    has_sewer = str_detect(
      tolower(coalesce(resofacts_sewer, "")), "sewer")
    )

#####
#Final Product
# Will be used for creating the model
# Keeps identifiers, target (price), structural features, location,
# derived JSON features, and amenity flags; drops rows with no price.
model_df <- df_parsed |>
  select(
    zpid,#
    price,#
    dateSold,#
    
    bedrooms,#
    bathrooms,#
    livingArea,#
    lotSize,#
    yearBuilt,#
    zipcode,#
    latitude,#
    longitude,#
    city,#
    county,#
    streetAddress,#
    
    schools,
    #min_school_distance, # SWITCH to distance 
    #num_schools, # and grade levels
    num_nearby_cities,#
    closest_city,#
    
    walk_score,#
    bike_score,#
    transit_score,#
    
    has_fireplace,#
    has_garage,#
    has_basement,#
    has_pool,#
    has_new_construction,#
    has_public_water, # ADD these two
    has_sewer,
    
    has_hoa#
  ) |>
  drop_na(price)

View(model_df)


# NOTE(review): `schools` is a list-column after parsing — write_csv
# cannot store list-columns and will error; the RDS/RData saves further
# down exist precisely because of this. Confirm which succeeded.
write_csv(model_df, "Data.1.0.6.csv")

write_csv(model_df, "JsonSchool.csv")

# Schools pulled from the UNPARSED df (raw JSON strings), so CSV is fine
just_schools <- df |> 
  select(schools)

write_csv(just_schools, "Transfer.csv")

Data_1_0_6 <- read_csv("Data.1.0.6.csv")

library(jsonlite)

# JSON round-trip keeps the nested school structure intact
write_json(just_schools, "just_schools.json")


atttempt <- fromJSON('just_schools.json')
# WORKS YIPPEE
# WORKS YIPPEE
#####
# Check Columns
library(tidyverse)

# Bedroom count distribution, in ascending bedroom order
model_df |> 
  select(bedrooms) |> 
  #filter(is.na(interior)) |> 
  group_by(bedrooms) |> 
  count() |> 
  arrange(bedrooms) |> 
  print(n = 23)


# Improperly counted house statistics that may skew 
# NOTE(review): these filters are applied to `df`, but the models below
# are fit on `model_df` / `forest` — if the intent was to clean the
# modeling data, apply the same filters there.
df <- df |>
  filter(bedrooms <= 12) 

df <- df |>
  filter(bedrooms > 0) 


df <- df |>
  filter(bathrooms <= 13) 

df <- df |>
  filter(bathrooms > 0) 

#####
# Rank variables by correlation strength with price.
library(corrr)

# corrr::correlate only handles numeric columns, so restrict first.
numeric_only <- model_df |> 
  select(where(is.numeric))

# focus() keeps just the price column; sort by absolute correlation.
cor_results <- numeric_only |> 
  correlate() |> 
  focus(price) |> 
  arrange(desc(abs(price)))

cor_results

# Fit a baseline linear model on the full sample.
model_simple <- lm(
  price ~ bedrooms + livingArea + yearBuilt +
    has_garage + has_basement + num_nearby_cities,
  data = model_df
)

# Same specification restricted to Milwaukee County
county_model <- model_df |> 
  filter(county == 'Milwaukee County')

model_simple_mil <- lm(
  price ~ bedrooms + livingArea + yearBuilt +
    has_garage + has_basement + num_nearby_cities,
  data = county_model
)

summary(model_simple_mil)

library(lmtest)

# Breusch-Pagan test: homoscedasticity
bptest(model_simple_mil)

# Durbin-Watson test: autocorrelation
dwtest(model_simple_mil)

# SOMEHOW WE FAIL BOTH!!!!

# Save Point
# NOTE(review): model_df carries the list-column `schools`; write_csv may
# error on it — the RDS/RData saves below are the reliable checkpoints.
write_csv(model_df, "Data.1.0.5.csv")

# Transfer attempts to keep jsonFiles
saveRDS(model_df, "Data.2.0.1.rds")

save(model_df, file = 'Data.2.0.1.RData')
# NOTE: load() restores `model_df` into the global env and returns only
# the character name "model_df" — Data_Attempt is NOT the data.
# NOTE(review): this path points at an unrelated ECON326 folder; confirm.
Data_Attempt <- load("~/Desktop/ECON326/03/05_In_Class_Example/Data.2.0.1.RData")

#####
# New model needed since assuming a linear model is aweful
# Run a ARIMA?
# Pull by month?
# Locational Issue?
# Random Forest, Trees?
# ANOVA?

#####
library(tidyverse)

# Done after cheating discovered: drop identifiers and leak-prone columns
# before the train/test split.
# NOTE(review): `forest` is not created anywhere in this script — it is
# presumably model_df (minus list-columns) from an interactive session;
# confirm before re-running from here.
forest_noAdress <- na.omit(forest) |>
  select(-streetAddress, -zpid, -longitude, -latitude, -has_pool,
         -has_fireplace, -num_nearby_cities, -zipcode)

set.seed(123)

# 80/20 split.
# FIX: seq_len() instead of 1:nrow() — 1:n counts DOWN (1, 0) on an empty
# table instead of giving an empty index. Same RNG draws as before.
train_idx <- sample(seq_len(nrow(forest_noAdress)),
                    0.8 * nrow(forest_noAdress))

train_data <- forest_noAdress[train_idx, ]
test_data  <- forest_noAdress[-train_idx, ]

# Random Forest
library(randomForest)

# 500 trees; importance = TRUE stores permutation importance for the
# plots below.
rf_model <- randomForest(price ~ ., data = train_data, ntree = 500, importance = TRUE)

predictions <- predict(rf_model, newdata = test_data)

# Evaluate on the held-out 20%
actual <- test_data$price

# RMSE
rmse <- sqrt(mean((predictions - actual)^2))

# R-squared: squared correlation between predicted and actual
r2 <- cor(predictions, actual)^2

rmse
r2

# Predicted vs actual with the 45-degree perfect-fit line.
# FIX: base-graphics calls are not ggplot layers — the previous
# `plot(...) + abline(...)` added the two functions' NULL returns; they
# must be separate statements.
plot(actual, predictions,
     xlab = "Actual Price",
     ylab = "Predicted Price",
     main = "Random Forest Predictions")
abline(0, 1, col = "red")


# Var importance (table and dotchart)
importance(rf_model)
varImpPlot(rf_model)

#Interpretation:
#Home size (living area) is the dominant driver of price
#Location variables (county, zipcode, city) are extremely influential
#Home quality/age (yearBuilt, bathrooms) also matter a lot

# Model
rf_model
saveRDS(rf_model, "rf_model.rds")

# Later:
rf_model <- readRDS("rf_model.rds")

#####
# Re-run the forest after dropping the three weakest predictors from the
# first model's importance ranking.
forest_drop <- na.omit(forest_noAdress) |>
  select(-has_hoa, -has_new_construction, -has_basement)

set.seed(123)

# Same 80/20 split scheme as before
train_idx <- sample(1:nrow(forest_drop), 0.8 * nrow(forest_drop))

train_drop <- forest_drop[train_idx, ]
test_drop  <- forest_drop[-train_idx, ]

# Random Forest
library(randomForest)

new_rf_model <- randomForest(price ~ ., data = train_drop,
                             ntree = 500, importance = TRUE)

drop_predictions <- predict(new_rf_model, newdata = test_drop)

# Hold-out evaluation: RMSE and squared correlation
drop_actual <- test_drop$price

drop_rmse <- sqrt(mean((drop_predictions - drop_actual)^2))

drop_r2 <- cor(drop_predictions, drop_actual)^2

drop_rmse
drop_r2


# Variable importance after the drop
importance(new_rf_model)
varImpPlot(new_rf_model)

#####
# Gradient Boosted
# FIX: an unconditional install.packages() in a script re-installs on
# every run and fails without internet/write access; only install when
# the package is genuinely missing.
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages('xgboost')
}
library(xgboost)

# Design matrix without intercept; target kept out of the features
X <- model.matrix(price ~ . - 1, data = train_drop)
y <- train_drop$price

# NOTE(review): dtrain is built but never used below — the model is fit
# straight from X/y. Kept for parity with the xgb.DMatrix API.
dtrain <- xgb.DMatrix(data = X, label = y)


#?xgboost()
# Train model
# NOTE(review): x=/y= is the newer xgboost R interface; older releases
# expect xgboost(data = X, label = y, ...) — confirm the installed version.
xgb_model <- xgboost(
  x = X,
  y = y,
  nrounds = 500,
  objective = "reg:squarederror",
  verbose = 0
)

xgb.save(xgb_model, "xgb_model.model")


#Evaluate on the hold-out set
actual <- test_drop$price


# Rebuild design matrices: factor levels present only in training create
# columns the test matrix lacks.
X_train <- model.matrix(price ~ . -1, train_drop)
X_new   <- model.matrix(price ~ . -1, test_drop)

missing_cols <- setdiff(colnames(X_train), colnames(X_new))

# FIX: pad all missing columns with one zero matrix instead of growing
# X_new with cbind() once per column (quadratic copying).
if (length(missing_cols) > 0) {
  pad <- matrix(0, nrow = nrow(X_new), ncol = length(missing_cols),
                dimnames = list(NULL, missing_cols))
  X_new <- cbind(X_new, pad)
}

# Same column order as training; drop = FALSE keeps matrix shape even for
# a single surviving column, and this also drops any test-only columns.
X_new <- X_new[, colnames(X_train), drop = FALSE]
boost_predictions <- predict(xgb_model, newdata = X_new)

boost_rmse <- sqrt(mean((boost_predictions - actual)^2))

# R-squared
boost_r2 <- cor(boost_predictions, actual)^2

boost_rmse
boost_r2

# Feature importance
importance_matrix <- xgb.importance(model = xgb_model)
xgb.plot.importance(importance_matrix)
# Interpretation (after removing addresses, which had let the model
# memorize locations): both models agree living area is the strongest
# predictor, with yearBuilt and area (zipcode) close behind; transit
# scores also rank high.


#####
# K-fold CV test
# Ran both models and the gradient boosted model outperforms Random Forest.
# Drop the high-cardinality city name and strip obviously bad rows in one
# chain: tiny/zero living areas and prices, zero-size lots, and lots over
# 1,742,400 sqft (40 acres).
kfold_model <- forest_drop |> 
  select(-closest_city) |> 
  filter(livingArea >= 100,
         price >= 100,
         lotSize >= 100,
         lotSize < 1742400)

#K-fold cross validation
# FIX: duplicate library() calls removed (randomForest, xgboost and caret
# were each loaded two or three times; dplyr comes with tidyverse).
library(tidyverse)
library(xgboost)
library(randomForest)
library(GPfit)
library(tidymodels)
library(caret)

set.seed(123)
#set.seed(0218)

# Create 5 folds of row indices over the target
folds <- createFolds(kfold_model$price, k = 5, list = TRUE)


# Store results
# FIX: preallocated instead of growing from c() inside the loop
kfold_rmse <- numeric(5)
kfold_r2   <- numeric(5)

# Run the models
# Each fold: rebuild the design matrices, fit xgboost on the other four
# folds, pad test-only level gaps with zero columns, then score.
for(i in 1:5){
  
  cat("Running Fold:", i, "\n")
  
  # Split data
  test_idx  <- folds[[i]]
  train_data <- kfold_model[-test_idx, ]
  test_data <- kfold_model[test_idx, ]
  
  #### XGBOOST ####
  # Intercept-free design matrix; factors expand to dummies
  X_train <- model.matrix(price ~ . -1, train_data)
  y_train <- train_data$price
  
  # NOTE(review): x=/y= is the newer xgboost R interface; older releases
  # expect data=/label= — confirm the installed version.
  xgb_model <- xgboost(
    x = X_train,
    y = y_train,
    nrounds = 500,
    objective = "reg:squarederror",
    verbose = 0
  )
  
  X_test <- model.matrix(price ~ . -1, test_data)
  
  # Fix missing columns: factor levels seen only in training get
  # zero-filled columns in the test matrix.
  missing_cols <- setdiff(colnames(X_train), colnames(X_test))
  
  # NOTE: cbind-per-column grows the matrix each pass; fine at this size,
  # but the batched zero-matrix pattern used elsewhere is faster.
  for(col in missing_cols){
    X_test <- cbind(X_test, 0)
    colnames(X_test)[ncol(X_test)] <- col
  }
  
  # Align to the exact training column order
  X_test <- X_test[, colnames(X_train), drop = FALSE]
  
  xgb_pred <- predict(xgb_model, newdata = X_test)
  
  actual  <- test_data$price
  
  kfold_rmse[i] <- sqrt(mean((xgb_pred - actual)^2))
  kfold_r2[i]   <- cor(xgb_pred, actual)^2
}


# Couldn't get to work to put out pred and actual
#tibble(Predicted_Price = round(predict(xgb_model, newdata = X_test, type = 'response'), 4)) |> 
#  DT::datatable()


#xgb_pred
# Final Results: mean performance across the 5 folds
mean(kfold_rmse)
mean(kfold_r2)

# Save model
# NOTE: this is the model from the LAST fold only, not an ensemble of
# all five — acceptable for the demo, but worth knowing.
xgb.save(xgb_model, "Apr27_model.model")

xgb.importance(model = xgb_model)
#summary(xgb_model)



#####
# Used for setting base level values on the HTML 
# Medians feed the web form's default slider/input values; uncomment one
# summarise at a time (they cannot be chained — each consumes the data).
model_df |> 
  #select(walk_score) |> 
  summarise(Walk = median(walk_score, na.rm = TRUE))# |> 
  #summarise(Trans = median(transit_score, na.rm = TRUE))# |> 
  #summarise(Bike = median(bike_score, na.rm = TRUE)) 

# NOTE: labelled "Average" but this is the MEDIAN lot size
model_df |> 
  summarise(Average = median(lotSize, na.rm = TRUE))


# Testing smaller model for 
library(xgboost)

# Load saved model (the last CV fold's model saved above)
model <- xgb.load("Apr27_model.model")


# Recreate your training columns from kfold_model so single-row inputs
# can be padded to the exact training layout.
X_full <- model.matrix(price ~ . -1, kfold_model)
train_cols <- colnames(X_full)



# None of the below code works, but it was an attempt at testing for
# singular values (single-house what-if predictions).
# NOTE(review): the one-row frame carries only a subset of the training
# features, so every absent feature is implicitly 0 after padding —
# verify that zero is the intended baseline before trusting the output.
# Create one-row input
new_house <- data.frame(
  bedrooms = 3,
  bathrooms = 2.5,
  livingArea = 1405,
  yearBuilt = 1965,
  lotSize = 16553,
  
  walk_score = 50,
  bike_score = 50,
  transit_score = 50,
  
  city = "Madison",
  county = "Dane",
  
  has_garage = TRUE,
  has_public_water = TRUE,
  has_sewer = TRUE,
  
  dateSold = "2024-04-01"
)

# Convert same types as training
new_house$dateSold <- as.Date(new_house$dateSold)

# Build the design matrix for the single row
new_matrix <- model.matrix(~ . -1, new_house)

# FIX: add all missing training columns in one zero-padded matrix instead
# of growing new_matrix with cbind() once per column.
missing_cols <- setdiff(train_cols, colnames(new_matrix))

if (length(missing_cols) > 0) {
  pad <- matrix(0, nrow = nrow(new_matrix), ncol = length(missing_cols),
                dimnames = list(NULL, missing_cols))
  new_matrix <- cbind(new_matrix, pad)
}

# Keep only training columns in the correct order
new_matrix <- new_matrix[, train_cols, drop = FALSE]

# Predict
pred <- predict(model, new_matrix)

print(pred)