I have the dataset below, and I want to fit a linear regression for each combination of country and area and then cbind the predicted values to the dataset:
Final data frame after adding three more columns:
I have done it for one country and one area but want to do it for each country and area and put the predicted, upper and lower limit values back in the data set by cbind:
# Example data: weekly amounts for three country/Area combinations
# (US-G, US-I and UK-A), five weeks each.
data <- data.frame(
  country = rep(c("US", "UK"), times = c(10, 5)),
  Area = rep(c("G", "I", "A"), each = 5),
  week = rep(c(1, 2, 3, 4, 5), times = 3),
  amount = c(12, 23, 34, 32, 12, 12, 34, 45, 65, 45, 45, 34, 23, 43, 43)
)

# Keep only the rows of a single combination: country "US", Area "G".
is_us_g <- data$country == "US" & data$Area == "G"
data_1 <- data[is_us_g, ]

# Regress amount on week for that subset, then compute the fitted values
# together with the lower/upper 95% prediction limits for the same rows.
model <- lm(amount ~ week, data = data_1)
pre <- predict(
  model,
  newdata = data_1,
  interval = "prediction",
  level = 0.95
)
pre
How can I loop this for other combination of country and Area?
...and a Base R solution:
# Example data: weekly amounts for three country/Area combinations.
data <- data.frame(
  country = rep(c("US", "UK"), times = c(10, 5)),
  Area = rep(c("G", "I", "A"), each = 5),
  week = rep(c(1, 2, 3, 4, 5), times = 3),
  amount = c(12, 23, 34, 32, 12, 12, 34, 45, 65, 45, 45, 34, 23, 43, 43)
)

# One key per row identifying its country/Area combination, e.g. "US-G".
group_key <- paste(data$country, data$Area, sep = "-")

# Fit amount ~ week on one group's rows and return those rows with the
# fit / lwr / upr columns of the 95% prediction interval appended.
fit_group <- function(grp) {
  fit <- lm(amount ~ week, data = grp)
  limits <- predict(fit, newdata = grp, interval = "prediction", level = 0.95)
  cbind(grp, limits)
}

# Split by group, model each piece, and stack the pieces back together.
result <- do.call(rbind, lapply(split(data, group_key), fit_group))
result
...the results:
country Area week amount fit lwr upr
UK-A.11 UK A 1 45 36.6 -6.0463638 79.24636
UK-A.12 UK A 2 34 37.1 -1.3409128 75.54091
UK-A.13 UK A 3 23 37.6 0.6671656 74.53283
UK-A.14 UK A 4 43 38.1 -0.3409128 76.54091
UK-A.15 UK A 5 43 38.6 -4.0463638 81.24636
US-G.1 US G 1 12 20.8 -27.6791493 69.27915
US-G.2 US G 2 23 21.7 -21.9985147 65.39851
US-G.3 US G 3 34 22.6 -19.3841749 64.58417
US-G.4 US G 4 32 23.5 -20.1985147 67.19851
US-G.5 US G 5 12 24.4 -24.0791493 72.87915
US-I.6 US I 1 12 20.8 -33.8985900 75.49859
US-I.7 US I 2 34 30.5 -18.8046427 79.80464
US-I.8 US I 3 45 40.2 -7.1703685 87.57037
US-I.9 US I 4 65 49.9 0.5953573 99.20464
US-I.10 US I 5 45 59.6 4.9014100 114.29859
We can also use function augment from package broom to get your desired information:
library(dplyr)  # group_by(), mutate(), select()
library(tidyr)  # nest(), unnest()
library(purrr)  # map()
library(broom)  # augment()

# Fit one linear model per country/Area combination and return the original
# observations together with the fitted value and the prediction-interval
# limits that augment() reports as .fitted, .lower and .upper.
data %>%
  group_by(country, Area) %>%
  # One row per group; that group's observations go into the list-column `data`.
  nest() %>%
  mutate(
    models = map(data, ~ lm(amount ~ week, data = .x)),
    aug = map(models, ~ augment(.x, interval = "prediction"))
  ) %>%
  unnest(aug) %>%
  select(country, Area, amount, week, .fitted, .lower, .upper)
# A tibble: 15 x 7
# Groups: country, Area [3]
country Area amount week .fitted .lower .upper
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 US G 12 1 20.8 -27.7 69.3
2 US G 23 2 21.7 -22.0 65.4
3 US G 34 3 22.6 -19.4 64.6
4 US G 32 4 23.5 -20.2 67.2
5 US G 12 5 24.4 -24.1 72.9
6 US I 12 1 20.8 -33.9 75.5
7 US I 34 2 30.5 -18.8 79.8
8 US I 45 3 40.2 -7.17 87.6
9 US I 65 4 49.9 0.595 99.2
10 US I 45 5 59.6 4.90 114.
11 UK A 45 1 36.6 -6.05 79.2
12 UK A 34 2 37.1 -1.34 75.5
13 UK A 23 3 37.6 0.667 74.5
14 UK A 43 4 38.1 -0.341 76.5
15 UK A 43 5 38.6 -4.05 81.2
Here is a tidyverse way to do this for every combination of country and Area.
library(tidyverse)

# For every country/Area combination: nest that group's rows, fit
# amount ~ week on them, predict back onto the same rows with a 95%
# prediction interval, and unnest rows and predictions side by side.
data %>%
  group_by(country, Area) %>%
  nest() %>%
  mutate(
    model = map(data, function(grp) lm(amount ~ week, data = grp)),
    result = map2(model, data, function(fit, grp) {
      limits <- predict(fit, newdata = grp, interval = "prediction", level = 0.95)
      data.frame(limits)
    })
  ) %>%
  ungroup() %>%
  # The fitted model objects are no longer needed once predictions exist.
  select(-model) %>%
  unnest(c(data, result))
# country Area week amount fit lwr upr
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 US G 1 12 20.8 -27.7 69.3
# 2 US G 2 23 21.7 -22.0 65.4
# 3 US G 3 34 22.6 -19.4 64.6
# 4 US G 4 32 23.5 -20.2 67.2
# 5 US G 5 12 24.4 -24.1 72.9
# 6 US I 1 12 20.8 -33.9 75.5
# 7 US I 2 34 30.5 -18.8 79.8
# 8 US I 3 45 40.2 -7.17 87.6
# 9 US I 4 65 49.9 0.595 99.2
#10 US I 5 45 59.6 4.90 114.
#11 UK A 1 45 36.6 -6.05 79.2
#12 UK A 2 34 37.1 -1.34 75.5
#13 UK A 3 23 37.6 0.667 74.5
#14 UK A 4 43 38.1 -0.341 76.5
#15 UK A 5 43 38.6 -4.05 81.2
And one more:
library(tidyverse)
# Fit amount ~ week separately for each country/Area combination and append
# the fit / lwr / upr columns of the 95% prediction interval to `data`.
# NOTE(review): the final cbind() pairs rows purely by position, so this
# presumably relies on each group's rows being contiguous in `data` and on
# split() returning the groups in their order of first appearance (which is
# what fct_inorder() appears to be used for) — confirm before using it on
# data that is not sorted by group.
data %>%
# Build a single grouping factor from country and Area.
mutate(CountryArea=paste0(country,Area) %>% factor %>% fct_inorder) %>%
# One data frame per group.
split(.$CountryArea) %>%
# One fitted model per group.
map(~lm(amount~week, data=.)) %>%
# No newdata is supplied, so each model predicts for the rows it was fitted on.
map(predict, interval = "prediction",level = 0.95) %>%
# Stack the per-group prediction matrices into one matrix.
reduce(rbind) %>%
cbind(data, .)
country Area week amount fit lwr upr
1 US G 1 12 20.8 -27.6791493 69.27915
2 US G 2 23 21.7 -21.9985147 65.39851
3 US G 3 34 22.6 -19.3841749 64.58417
4 US G 4 32 23.5 -20.1985147 67.19851
5 US G 5 12 24.4 -24.0791493 72.87915
6 US I 1 12 20.8 -33.8985900 75.49859
7 US I 2 34 30.5 -18.8046427 79.80464
8 US I 3 45 40.2 -7.1703685 87.57037
9 US I 4 65 49.9 0.5953573 99.20464
10 US I 5 45 59.6 4.9014100 114.29859
11 UK A 1 45 36.6 -6.0463638 79.24636
12 UK A 2 34 37.1 -1.3409128 75.54091
13 UK A 3 23 37.6 0.6671656 74.53283
14 UK A 4 43 38.1 -0.3409128 76.54091
15 UK A 5 43 38.6 -4.0463638 81.24636
Thank you all for reading this problem.
What I would like to do is multiply my test data by my index file, matching on column names.
That is, multiply Dp_water in the test data by Dp_water in the index, and repeat this for all index variables: kcal, fat, prot and carbs.
In my test data i have for 10 individuals data on consumption of 4 food groups in grams.
for each individual i would like to calculate the kcal fat prot carb intake.
For each individual i would like to make a new variable
Dp_water_kcal, Dp_coffee_kcal, Dp_soup_kcal , Dp_soda_kcal
Dp_water_fat, Dp_coffee_fat, Dp_soup_fat , Dp_soda_fat
etc.
library(tidyverse)
Sample data
Index file
# Index table: one row per food variable, one column per index variable
# (kcal, fat, prot, carbs).
index <- data.frame(
  Variable = c("Dp_water", "Dp_coffee", "Dp_soup", "Dp_soda"),
  kcal = c(0, 10, 20, 40),
  fat = c(0, 5, 10, 15),
  prot = c(2, 4, 6, 8),
  carbs = c(3, 6, 9, 12)
)

# Transpose the table: go long first (one row per food variable / index
# variable pair), then wide again with one column per food variable, so the
# column names line up with those of the consumption data.
index_long <- pivot_longer(index, c(kcal, fat, prot, carbs))
index <- pivot_wider(index_long, names_from = Variable, values_from = value)
> index
# A tibble: 4 x 5
name Dp_water Dp_coffee Dp_soup Dp_soda
<chr> <dbl> <dbl> <dbl> <dbl>
1 kcal 0 10 20 40
2 fat 0 5 10 15
3 prot 2 4 6 8
4 carbs 3 6 9 12
Below subject data consumption of 4 foodgroups.
# Consumption of four food groups for ten subjects, identified by `id`.
test_data <- data.frame(
  Dp_water = 11:20,
  Dp_coffee = 31:40,
  Dp_soup = 21:30,
  Dp_soda = 41:50,
  id = seq_len(10)
)
Dp_water Dp_coffee Dp_soup Dp_soda id
1 11 31 21 41 1
2 12 32 22 42 2
3 13 33 23 43 3
4 14 34 24 44 4
5 15 35 25 45 5
6 16 36 26 46 6
7 17 37 27 47 7
8 18 38 28 48 8
9 19 39 29 49 9
10 20 40 30 50 10
If i do the following it works. But i would like to do this for all variables and not only kcal. And i would like to be able to keep the id column.
# Drop id, combine the remaining columns pairwise with a second set of
# columns, and suffix the resulting column names with "_kcal".
# NOTE(review): the second argument to map2_dfr() is taken from test_data
# itself (not from index), so every column is multiplied by itself and
# divided by 100 — e.g. 11 * 11 / 100 = 1.21 in the first row of the output
# below. The kcal values of the index are not used here.
test_data %>%
select(-id) %>%
map2_dfr(., test_data[match(names(.), names(test_data))], ~.x/100 * .y) %>%
set_names(paste0(names(.), "_kcal"))
# A tibble: 10 x 4
Dp_water_kcal Dp_coffee_kcal Dp_soup_kcal Dp_soda_kcal
<dbl> <dbl> <dbl> <dbl>
1 1.21 9.61 4.41 16.8
2 1.44 10.2 4.84 17.6
3 1.69 10.9 5.29 18.5
4 1.96 11.6 5.76 19.4
5 2.25 12.2 6.25 20.2
6 2.56 13.0 6.76 21.2
7 2.89 13.7 7.29 22.1
8 3.24 14.4 7.84 23.0
9 3.61 15.2 8.41 24.0
10 4 16 9 25
Thank you all for any help!
I have data like this:
Name Group Heath BP PM
QW DE23 20 60 10
We Fw34 0.5 42 2.5
Sd Kl78 0.4 0.1 0.5
Op Ss14 43 45 96
I need to remove every row in which all of the values are less than 1.8.
I used following command
# Three attempts that all index COLUMNS by their column sums (none of them
# filters rows). Each one calls colSums() on the whole data frame, which
# also contains the text columns Name and Group.
data[colSums(data)>=1.8]
data[,colSums(data)>=1.8, drop=FALSE]
subset(data, select=colSums(data) >=1.8)
But I got an error like this: "Error in colSums(data) : 'x' must be numeric"
Expected out put
Name Group Heath BP PM
QW DE23 20 60 10
We Fw34 0.5 42 2.5
Op Ss14 43 45 96
You can use rowSums to select the rows whose sum is >=1.8:
# Keep the rows whose values, ignoring the first two columns, sum to at
# least 1.8.
numeric_part <- data[-1:-2]
data[rowSums(numeric_part) >= 1.8, ]
# Name Group Heath BP PM
#1 QW DE23 20.0 60 10.0
#2 We Fw34 0.5 42 2.5
#4 Op Ss14 43.0 45 96.0
or where any element in the row is >=1.8:
# Keep the rows in which at least one value, ignoring the first two columns,
# is >= 1.8: count the qualifying cells per row and require a non-zero count.
is_large <- data[-1:-2] >= 1.8
data[rowSums(is_large) > 0, ]
# Name Group Heath BP PM
#1 QW DE23 20.0 60 10.0
#2 We Fw34 0.5 42 2.5
#4 Op Ss14 43.0 45 96.0
data[-1:-2] drops the first two columns (Name and Group), leaving only the numeric columns.
Here is a tidyverse solution:
library(tidyverse)

# Example data: two identifier columns followed by three numeric columns.
df <- tibble::tribble(
  ~Name, ~Group, ~Heath, ~BP, ~PM,
  "QW", "DE23", 20, 60, 10,
  "We", "Fw34", 0.5, 42, 2.5,
  "Sd", "Kl78", 0.4, 0.1, 0.5,
  "Op", "Ss14", 43, 45, 96
)

# Keep the rows in which at least one numeric column is >= 1.8.
# if_any() with where() replaces the superseded filter_if()/any_vars() pair.
df %>%
  filter(if_any(where(is.numeric), ~ .x >= 1.8))
#> # A tibble: 3 x 5
#> Name Group Heath BP PM
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 QW DE23 20 60 10
#> 2 We Fw34 0.5 42 2.5
#> 3 Op Ss14 43 45 96
Created on 2020-12-07 by the reprex package (v0.3.0)
The easiest way is to use the filter() function from dplyr package in combination with select to automatically detect numeric columns:
library(dplyr)

# Example data: two identifier columns followed by three numeric columns.
df <- data.frame(
  Name = c("QW", "We", "Sd", "Op"),
  Group = c("DE23", "Fw34", "Kl78", "Ss14"),
  Heath = c(20, 0.5, 0.4, 43),
  BP = c(60, 42, 0.1, 45),
  PM = c(10, 2.5, 0.5, 96)
)

# Keep the rows whose numeric columns SUM to at least 1.8. Note that this is
# a row-sum criterion, not "any single value >= 1.8".
# select(where(is.numeric)) replaces the superseded select_if().
df %>% filter(rowSums(select(., where(is.numeric))) >= 1.8)
Name Group Heath BP PM
1 QW DE23 20.0 60 10.0
2 We Fw34 0.5 42 2.5
3 Op Ss14 43.0 45 96.0
An option with Reduce from base R
# Compare every column after the first two against 1.8, OR the per-column
# results together element-wise, and keep the rows where the result is TRUE.
at_least <- lapply(df[-(1:2)], function(column) column >= 1.8)
df[Reduce(`|`, at_least), ]
# Name Group Heath BP PM
#1 QW DE23 20.0 60 10.0
#2 We Fw34 0.5 42 2.5
#4 Op Ss14 43.0 45 96.0
This question already has answers here:
R Loop for Variable Names to run linear regression model
(2 answers)
Closed 2 years ago.
I have the following dataframe I'm calling "test" and I am trying to run a Bartlett's test and a Kruskal-Wallis test for each "metab" vs the "diagnosis"
> test
Index tube.label age gender diagnosis metab1 metab2 metab3 metab4 metab5 metab6
1 200 73 Male Cancer 6 1.5 2 5 8 1.5
2 201 71 Male Healthy 6 1.5 2 11.5 50 1.5
4 202 76 Male Adenoma 2 1.5 2 5 8 1.5
7 203 58 Female Cancer 2 1.5 2 1.5 2.5 1.5
9 204 73 Male Cancer 2 1.5 2 1.5 8 1.5
12 205 72 Male Healthy 6 1.5 17.8272 13.5 184.2 4.5
13 206 46 Female Cancer 30.0530 1.5 2 21.2 16.6 4.5
14 207 38 Female Healthy 6 1.5 2 12.494 31.59 1.5
15 208 60 Male Cancer 6 1.5 2 13.2 53.2 4.5
16 209 72 Female Cancer 6 1.5 2 1.5 8 1.5
17 210 72 Male Adenoma 6 1.5 2 22.829 102.44 9.069
18 211 52 Male Cancer 6 1.5 2 1.5 8 1.5
19 212 64 Male Healthy 6 1.5 2 1.5 8 1.5
20 213 68 Male Cancer 6 1.5 2 26.685 40.9 4.5
21 214 60 Male Healthy 24.902 1.5 42.443 22.942 498.5 4.5
23 215 70 Female Healthy 6 1.5 2 1.5 19.908 4.5
24 216 42 Female Healthy 6 1.5 2 1.5 17.7 1.5
25 217 72 Male Inflammation 6 1.5 2 1.5 8 1.5
26 218 71 Male Healthy 51 1.5 2 41.062 182.2 11.340
27 219 51 Female Inflammation 2 1.5 2 1.5 8 1.5
I can run them individually and it gives me the proper value:
bartlett.test(metab1 ~ diagnosis, data = test)
Bartlett test of homogeneity of variances
data: metab1 by diagnosis
Bartlett's K-squared = 5.1526, df = 3, p-value = 0.161
kruskal.test(metab1 ~ diagnosis, data = test)
Kruskal-Wallis rank sum test
data: metab1 by diagnosis
Kruskal-Wallis chi-squared = 4.3475, df = 3, p-value = 0.2263
However when I try to run a for loop (I have more than 100 of them to run) I keep getting the following error:
Bartlett error:
# Names of the columns to test: everything from column 6 to the last column.
testcols <- colnames(test[6:ncol(test)])
# NOTE: `met` holds a column NAME as a character string, so the formula
# `met ~ diagnosis` refers to the loop variable `met` itself rather than to
# the column of `test` with that name; this is what the
# "variable lengths differ" error shown below reports.
# The value of bartlett.test() is also neither stored nor printed here.
for (met in testcols){
bartlett.test(met ~ diagnosis, data = test)
}
>Error in model.frame.default(formula = met ~ diagnosis, data = test) :
variable lengths differ (found for 'diagnosis')
Kruskal-Wallis error:
# NOTE: `met` is a character string holding a column name, so the formula
# `met ~ diagnosis` uses the loop variable `met` itself instead of the
# column of `test` with that name; this is what the
# "variable lengths differ" error shown below reports.
# The value of kruskal.test() is also neither stored nor printed here.
for(met in testcols){
kruskal.test(met ~ diagnosis,data = test)
}
>Error in model.frame.default(formula = met ~ diagnosis, data = test) :
variable lengths differ (found for 'diagnosis')
Should I be using something else? Thank you for the help!
Try to create formula to apply using reformulate :
# Names of the columns to test: column 6 through the last column of `test`.
cols <- names(test)[6:ncol(test)]

# For each column name, build the formula `<column> ~ diagnosis` from the
# string and run Bartlett's test with it; the results are collected in a
# list in the same order as `cols`.
all_test <- lapply(cols, function(metab) {
  metab_formula <- reformulate("diagnosis", response = metab)
  bartlett.test(metab_formula, data = test)
})
You can do the same with kruskal.test.
This question already has an answer here:
Not able to scrape a second table within a page using rvest
(1 answer)
Closed 4 years ago.
I'm trying to webscrape the final table in https://www.baseball-reference.com/leagues/MLB/2015-standings.shtml
i.e. the "MLB Detailed Standings"
My R code is as follows:
library(XML)
library(httr)
library(plyr)
library(stringr)
# Build the URL of the 2015 MLB standings page.
url <- paste0("http://www.baseball-reference.com/leagues/MLB/", 2015, "-standings.shtml")
# Download the page with httr.
tab <- GET(url)
# Convert the raw response body to text and parse the HTML tables it contains.
# NOTE(review): tables that the page wraps inside HTML comments are
# presumably not part of the parsed document and so would not be returned
# here — confirm against the page source.
data <- readHTMLTable(rawToChar(tab$content))
However, it does not seem to pick up the table I want. Looking at the source code, it seems as though the table is commented out somehow?
Any help would be great
From the answer MrFlick linked:
library(XML)
library(tidyverse)
library(rvest)

# Read the standings page. xml2 functions are called with an explicit
# namespace because attaching rvest does not necessarily attach xml2.
page <- xml2::read_html("https://www.baseball-reference.com/leagues/MLB/2015-standings.shtml")

# Extract the tables that the page ships inside HTML comments.
alt_tables <- xml2::xml_find_all(page, "//comment()") %>% {
  # Keep only the comment nodes whose text contains <table or </table markup.
  # A plain "<" is used: "\\<" would be a word-boundary assertion in R's
  # default regex engine rather than a literal "<".
  raw_parts <- as.character(.[grep("</?table", as.character(.))])
  # Remove the comment begin and end tags
  strip_html <- stringi::stri_replace_all_regex(
    raw_parts, c("<\\!--", "-->"), c("", ""),
    vectorize_all = FALSE
  )
  # Loop through the pieces that have tables within markup, parse each piece
  # as HTML on its own, and keep the first table found in it.
  lapply(grep("<table", strip_html, value = TRUE), function(i) {
    rvest::html_table(xml2::xml_find_all(xml2::read_html(i), "//table")) %>%
      .[[1]]
  })
}

# The second commented-out table is the one selected here.
tbl <- alt_tables[[2]]
# as_tibble() replaces the deprecated as.tibble().
tbl <- as_tibble(tbl)
tbl
# A tibble: 31 x 23
Rk Tm Lg G W L `W-L%` R RA Rdiff SOS SRS pythWL Luck Inter Home Road ExInn
<int> <chr> <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <int> <chr> <chr> <chr> <chr>
1 1 STL NL 162 100 62 0.617 4 3.2 0.8 -0.3 0.5 96-66 4 11-9 55-26 45-36 8-8
2 2 PIT NL 162 98 64 0.605 4.3 3.7 0.6 -0.3 0.3 93-69 5 13-7 53-28 45-36 12-9
3 3 CHC NL 162 97 65 0.599 4.3 3.8 0.5 -0.3 0.2 90-72 7 10-10 49-32 48-33 13-5
4 4 KCR AL 162 95 67 0.586 4.5 4 0.5 0.2 0.7 90-72 5 13-7 51-30 44-37 10-6
5 5 TOR AL 162 93 69 0.574 5.5 4.1 1.4 0.2 1.6 102-60 -9 12-8 53-28 40-41 8-6
6 6 LAD NL 162 92 70 0.568 4.1 3.7 0.4 -0.3 0.1 89-73 3 10-10 55-26 37-44 6-9
7 7 NYM NL 162 90 72 0.556 4.2 3.8 0.4 -0.4 0 89-73 1 9-11 49-32 41-40 9-6
8 8 TEX AL 162 88 74 0.543 4.6 4.5 0.1 0.2 0.4 83-79 5 11-9 43-38 45-36 5-4
9 9 NYY AL 162 87 75 0.537 4.7 4.3 0.4 0.3 0.8 88-74 -1 11-9 45-36 42-39 4-9
10 10 HOU AL 162 86 76 0.531 4.5 3.8 0.7 0.2 0.9 93-69 -7 16-4 53-28 33-48 8-6
# ... with 21 more rows, and 5 more variables: `1Run` <chr>, vRHP <chr>, vLHP <chr>, `≥.500` <chr>, `<.500` <chr>
>