***
Spark (or R) examples usually use big files that contain thousands of lines.
You do not know the data and cannot easily play with it.
Here I put the simplest possible data set for Spark MLlib so that you can play with it and understand
which metrics are affected by which parameters.
It is not aimed at senior users, but it is perfect for beginners who need to calibrate parameters with simple data sets.
***
R has a built-in dataset called cars. Its first 30 rows are used below as carssub.
carssub
speed dist
1 4 2
2 4 10
3 7 4
4 7 22
5 8 16
6 9 10
7 10 18
8 10 26
9 10 34
10 11 17
11 11 28
12 12 14
13 12 20
14 12 24
15 12 28
16 13 26
17 13 34
18 13 34
19 13 46
20 14 26
21 14 36
22 14 60
23 14 80
24 15 20
25 15 26
26 15 54
27 16 32
28 16 40
29 17 32
30 17 40
We first take a subset of the data.
Then we add outliers (1, 5, 10, 15 and 20 outliers).
Then we look at the effect of the outliers.
lm : fits linear models.
abline : adds a line to a plot.
If you look at the picture you will see how well a line fits the data when there are no outliers.
Adding only 1 outlier changes the fit a lot. When 5 are added it gets much worse.

# Outlier demo on the built-in `cars` data set.
# Fits dist ~ speed on the first 30 rows, then refits after appending
# 1, 5, 10, 15 and 20 artificial outliers. Every data set is drawn with its
# fitted line in one 2 x 3 grid and the model summaries are printed.

carssub <- cars[1:30, ] # original data

# Artificial outliers: dist values far above anything in carssub.
cars_outliers1 <- data.frame(speed = c(20), dist = c(218))
cars_outliers5 <- data.frame(
  speed = c(19, 19, 20, 20, 20),
  dist = c(190, 186, 210, 220, 218)
)
cars_outliers10 <- data.frame(
  speed = c(19, 19, 20, 20, 20, 21, 22, 23, 24, 25),
  dist = c(190, 186, 210, 220, 218, 220, 224, 230, 235, 240)
)
cars_outliers15 <- data.frame(
  speed = c(19, 19, 20, 20, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30),
  dist = c(190, 186, 210, 220, 218, 220, 224, 230, 235, 240,
           244, 245, 248, 250, 252)
)
cars_outliers20 <- data.frame(
  speed = c(19, 19, 20, 20, 20, 21, 22, 23, 24, 25,
            26, 27, 28, 29, 30, 31, 32, 33, 34, 35),
  dist = c(190, 186, 210, 220, 218, 220, 224, 230, 235, 240,
           244, 245, 248, 250, 252, 254, 256, 258, 260, 262)
)

# Original data with each outlier set appended.
cars_total_1 <- rbind(carssub, cars_outliers1)
cars_total_5 <- rbind(carssub, cars_outliers5)
cars_total_10 <- rbind(carssub, cars_outliers10)
cars_total_15 <- rbind(carssub, cars_outliers15)
cars_total_20 <- rbind(carssub, cars_outliers20)

# Draw one panel: scatter plot of `data` plus the regression line of `fit`.
#   data  - data frame with `speed` and `dist` columns
#   fit   - model passed to abline() to draw the fitted line
#   title - panel title
# Axis limits are fixed so that all panels are directly comparable.
plot_fit <- function(data, fit, title) {
  plot(data$speed, data$dist, xlim = c(0, 40), ylim = c(0, 300),
       main = title, xlab = "speed", ylab = "dist",
       pch = "*", col = "red", cex = 2)
  abline(fit, col = "blue", lwd = 3, lty = 2)
  invisible(fit)
}

# Remember the previous graphics settings so the 2 x 3 layout does not leak
# into plots drawn after this script.
old_par <- par(mfrow = c(2, 3))

# The lm() calls stay at top level so the "Call:" line of each summary names
# the actual data set. print() is explicit because bare summary() calls show
# nothing when the script is run through source() with default settings.

# Model without outliers (baseline).
lm0 <- lm(dist ~ speed, data = carssub)
plot_fit(carssub, lm0, "Pure data")
print(summary(lm0))

lm1 <- lm(dist ~ speed, data = cars_total_1)
plot_fit(cars_total_1, lm1, "1 outlier added")
print(summary(lm1))

# Note: lm2 is the model for the 5-outlier data set (name kept as is).
lm2 <- lm(dist ~ speed, data = cars_total_5)
plot_fit(cars_total_5, lm2, "5 outliers added")
print(summary(lm2))

lm10 <- lm(dist ~ speed, data = cars_total_10)
plot_fit(cars_total_10, lm10, "10 outlier added")
print(summary(lm10))

lm15 <- lm(dist ~ speed, data = cars_total_15)
plot_fit(cars_total_15, lm15, "15 outlier added")
print(summary(lm15))

lm20 <- lm(dist ~ speed, data = cars_total_20)
plot_fit(cars_total_20, lm20, "20 outlier added")
print(summary(lm20))

par(old_par)
no outliers: dist = speed * 2.9 - 6.8
1 outlier: dist = speed * 6.1 -40
5 outliers: dist = speed * 11.4 - 95
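You can see that the slope and intercept move further and further away from the no-outlier fit as outliers
are added, up to 10 outliers. After that the fit changes shape and the residual standard error stops growing
(44.15 with 10 outliers, 42.11 with 15). The added points have become a large group of their own, so they
no longer behave like outliers.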
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.8446 8.7420 -0.783 0.440223
speed 2.9730 0.7046 4.219 0.000233 ***
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -40.015 19.271 -2.076 0.046817 *
speed 6.131 1.515 4.048 0.000351 ***
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -95.203 24.507 -3.885 0.000466 ***
speed 11.437 1.793 6.379 3.18e-07 ***
Call:
lm(formula = dist ~ speed, data = cars_total_1)
Residuals:
Min 1Q Median 3Q Max
-32.210 -15.883 -5.555 6.641 135.398
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -40.015 19.271 -2.076 0.046817 *
speed 6.131 1.515 4.048 0.000351 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 30.63 on 29 degrees of freedom
Multiple R-squared: 0.361, Adjusted R-squared: 0.339
F-statistic: 16.38 on 1 and 29 DF, p-value: 0.0003514
Call:
lm(formula = dist ~ speed, data = cars_total_5)
Residuals:
Min 1Q Median 3Q Max
-67.220 -27.755 -7.473 19.428 86.471
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -95.203 24.507 -3.885 0.000466 ***
speed 11.437 1.793 6.379 3.18e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 43.88 on 33 degrees of freedom
Multiple R-squared: 0.5522, Adjusted R-squared: 0.5386
F-statistic: 40.69 on 1 and 33 DF, p-value: 3.175e-07
Call:
lm(formula = dist ~ speed, data = cars_total_10)
Residuals:
Min 1Q Median 3Q Max
-81.855 -30.503 -0.082 34.346 77.691
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -123.552 20.715 -5.964 6.37e-07 ***
speed 13.965 1.366 10.221 1.85e-12 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 44.15 on 38 degrees of freedom
Multiple R-squared: 0.7333, Adjusted R-squared: 0.7263
F-statistic: 104.5 on 1 and 38 DF, p-value: 1.852e-12
Call:
lm(formula = dist ~ speed, data = cars_total_15)
Residuals:
Min 1Q Median 3Q Max
-78.833 -30.297 -3.225 31.292 71.650
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -114.7213 16.5388 -6.936 1.59e-08 ***
speed 13.2679 0.9684 13.701 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 42.11 on 43 degrees of freedom
Multiple R-squared: 0.8136, Adjusted R-squared: 0.8093
F-statistic: 187.7 on 1 and 43 DF, p-value: < 2.2e-16
Call:
lm(formula = dist ~ speed, data = cars_total_20)
Residuals:
Min 1Q Median 3Q Max
-73.15 -31.29 -6.27 33.74 79.83
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -93.3047 14.5507 -6.412 5.86e-08 ***
speed 11.6738 0.7548 15.466 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 42.92 on 48 degrees of freedom
Multiple R-squared: 0.8329, Adjusted R-squared: 0.8294
F-statistic: 239.2 on 1 and 48 DF, p-value: < 2.2e-16