Predicting Boston House Prices and Ionosphere Data Analysis with Machine Learning

Classified in Computers

Written in English with a size of 5.24 KB.

Boston Housing Data Analysis

# Packages used by the Boston housing analysis below: mlbench supplies the
# BostonHousing data set; dplyr, ggplot2 and reshape2 are used for select(),
# plotting and melt().
library(mlbench)
# Install dplyr only when it is missing, so re-running the script does not
# reinstall the package on every execution.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
library(ggplot2)
library(reshape2)

# Load the data set and keep a working copy under a shorter name.
data("BostonHousing")
housing <- BostonHousing
# Show the column types and a preview of the values.
str(housing)

# Density of the response variable medv across the data set.
ggplot(data = housing, mapping = aes(x = medv)) +
  stat_density() +
  labs(
    x = "Median Value ($1000s)",
    y = "Density",
    title = "Density Plot of Median Value House Price in Boston"
  ) +
  theme_minimal()

# Numeric summary of medv.
summary(housing[["medv"]])

# Scatter plots of medv against a handful of predictors, one panel per
# predictor. melt() reshapes the selected columns to long form (medv,
# variable, value) so that facet_wrap() can split on `variable`.
housing %>%
  select(crim, rm, age, rad, tax, lstat, medv) %>%
  melt(id.vars = "medv") %>%
  ggplot(aes(x = value, y = medv, colour = variable)) +
  geom_point(alpha = 0.7) +
  # The colour is set as a fixed parameter, outside aes(). Inside aes() the
  # string "black" is treated as a data value: it gets a colour from the
  # discrete scale and adds a stray "black" entry to the legend.
  stat_smooth(colour = "black") +
  facet_wrap(~variable, scales = "free", ncol = 2) +
  labs(x = "Variable Value", y = "Median House Price ($1000s)") +
  theme_minimal()

library("caret")
set.seed(123) # fix the RNG state so the partition is reproducible
# Row indices for the training set: a 75% sample drawn on medv.
to_train <- createDataPartition(y = housing$medv, p = 0.75, list = FALSE)
# The test rows are exactly the rows NOT used for training. Drawing a second,
# independent partition (as was done before) lets test rows overlap the
# training rows, which leaks training data into the evaluation.
to_test <- setdiff(seq_len(nrow(housing)), to_train)
train <- housing[to_train, ]
test <- housing[to_test, ]

# Model 1: medv on its raw scale, regressed on four predictors.
first_lm <- lm(medv ~ crim + rm + tax + lstat, data = train)

lm1_rsqu <- summary(first_lm)[["r.squared"]]
print(paste0(
  "1st linear model has an r-squared value of ",
  round(lm1_rsqu, 3)
))
## [1] "1st linear model has an r-squared value of 0.672"
#plot(first_lm)

# Model 2: same predictors, with a log-transformed response.
second_lm <- lm(log(medv) ~ crim + rm + tax + lstat, data = train)

lm2_rsqu <- summary(second_lm)[["r.squared"]]
print(paste0(
  "Our 2nd linear model has an r-squared value of ",
  round(lm2_rsqu, 3)
))

# Absolute value of the mean residual of the log model on the training data.
abs(mean(second_lm[["residuals"]]))

# Predict for the rows in `test` with the log model. exp() undoes the log()
# applied to medv in second_lm, so both columns are on the original scale.
predicted <- predict(second_lm, newdata = test)
results <- data.frame(predicted = exp(predicted), original = test$medv)

# Predicted against observed values, with a smoothed trend line.
ggplot(data = results, mapping = aes(x = predicted, y = original)) +
  geom_point() +
  stat_smooth() +
  labs(
    x = "Predicted Values",
    y = "Original Values",
    title = "Predicted vs. Original Values"
  ) +
  theme_minimal()

Ionosphere Data Analysis with KNN

# Install KernelKnn only when it is missing, so re-running the script does
# not reinstall the package on every execution.
if (!requireNamespace("KernelKnn", quietly = TRUE)) {
  install.packages("KernelKnn")
}
data(ionosphere, package = "KernelKnn")
# Number of distinct values in each column.
apply(ionosphere, 2, function(x) length(unique(x)))

# Drop the second column.
# NOTE(review): presumably it has a single distinct value in the count
# above - confirm against that output.
ionosphere <- ionosphere[, -2]

# Predictors: every column except the last, centred and scaled.
X <- scale(ionosphere[, -ncol(ionosphere)])

# Response: integer codes 1..K, numbered by the sorted order of the distinct
# values of the `class` column.
y <- match(ionosphere$class, sort(unique(ionosphere$class)))

# Random split of the row indices: 75% for training, the rest for testing.
spl_train <- sample(seq_along(y), round(length(y) * 0.75))

spl_test <- setdiff(seq_along(y), spl_train)
str(spl_train)
str(spl_test)

# Classification accuracy from a matrix of per-class scores.
#
# y_true: vector of true class codes; each code is compared with the column
#         index of the winning class, so codes must be 1..K in column order.
# preds:  matrix with one row per observation and one column per class; the
#         predicted class of a row is the column holding its largest value
#         (ties are broken at random).
#
# Returns the fraction of rows whose predicted class equals y_true.
acc <- function(y_true, preds) {
  predicted <- max.col(preds, ties.method = "random")

  # Compare element-wise rather than taking diag(table(...)): when some class
  # is never predicted the contingency table is not square, and its
  # "diagonal" no longer lines up with the correct predictions.
  mean(y_true == predicted)
}

library(KernelKnn)
# Plain k-nearest-neighbour classification of the test rows: k = 5,
# euclidean distance, no kernel weighting (weights_function = NULL).
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
preds_TEST <- KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ],
                        y[spl_train], k = 5,
                        method = "euclidean", weights_function = NULL,
                        regression = FALSE,
                        Levels = unique(y))
head(preds_TEST)

# Same data with k = 10, canberra distance and the built-in "tricube"
# kernel as the weights function.
preds_TEST_tric <- KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ],
                             y[spl_train], k = 10,
                             method = "canberra", weights_function = "tricube",
                             regression = FALSE,
                             Levels = unique(y))
head(preds_TEST_tric)

# Custom weights function: replaces each entry of W by its standard normal
# density and then rescales every row so that it sums to 1.
#
# W: numeric matrix.
# Returns a matrix with the same dimensions as W.
norm_kernel <- function(W) {
  density <- dnorm(W, mean = 0, sd = 1.0)
  row_totals <- rowSums(density)

  # row_totals has one entry per row, so recycling down the columns divides
  # each row by its own total.
  density / row_totals
}
# k = 10, canberra distance, with the user-defined norm_kernel function
# passed as the weights function instead of a built-in kernel name.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
preds_TEST_norm <- KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ],
                             y[spl_train], k = 10,
                             method = "canberra",
                             weights_function = norm_kernel,
                             regression = FALSE,
                             Levels = unique(y))
head(preds_TEST_norm)

# 5-fold cross-validation for two parameter settings.
# NOTE(review): the opening of the first call was missing from this file,
# which left a syntax error and fit_cv_pair1 undefined. The head
# `KernelKnnCV(X, y, k = 10, folds = 5, method = "canberra", ...` was
# reconstructed from the surviving arguments and the parallel second call;
# k = 10 in particular is an assumption - confirm against the original.
fit_cv_pair1 <- KernelKnnCV(X, y, k = 10, folds = 5, method = "canberra",
                            weights_function = "tricube", regression = FALSE,
                            Levels = unique(y), threads = 5)
str(fit_cv_pair1)

# Second setting: k = 9 with the "epanechnikov" kernel.
fit_cv_pair2 <- KernelKnnCV(X, y, k = 9, folds = 5, method = "canberra",
                            weights_function = "epanechnikov",
                            regression = FALSE,
                            Levels = unique(y), threads = 5)
str(fit_cv_pair2)

# Each cross-validated object returns a list of length 2; the code below
# reads its `preds` and `folds` elements. For every fold, accuracy is
# computed from the true labels of that fold's rows and the fold's
# predictions.
acc_pair1 <- vapply(
  seq_along(fit_cv_pair1$preds),
  function(i) acc(y[fit_cv_pair1$folds[[i]]], fit_cv_pair1$preds[[i]]),
  numeric(1)
)
acc_pair1
# End with a newline so later console output starts on its own line.
cat("accuracy for params_pair1 is :", mean(acc_pair1), "\n")

acc_pair2 <- vapply(
  seq_along(fit_cv_pair2$preds),
  function(i) acc(y[fit_cv_pair2$folds[[i]]], fit_cv_pair2$preds[[i]]),
  numeric(1)
)
acc_pair2
cat("accuracy for params_pair2 is :", mean(acc_pair2), "\n")

Related entries: