Remove rows based on condition R - r

I have a data as like this
Name Group Heath BP PM
QW DE23 20 60 10
We Fw34 0.5 42 2.5
Sd Kl78 0.4 0.1 0.5
Op Ss14 43 45 96
I need to remove all the rows if that values are less than 1.8
I used following command
data[colSums(data)>=1.8]
data[,colSums(data)>=1.8, drop=FALSE]
subset(data, select=colSums(data) >=1.8)
But I got error as like this "Error in colSums(data) : 'x' must be numeric"
Expected out put
Name Group Heath BP PM
QW DE23 20 60 10
We Fw34 0.5 42 2.5
Op Ss14 43 45 96

You can use to select rows where their sum is >=1.8:
data[rowSums(data[-1:-2])>=1.8,]
# Name Group Heath BP PM
#1 QW DE23 20.0 60 10.0
#2 We Fw34 0.5 42 2.5
#4 Op Ss14 43.0 45 96.0
or where any element in the row is >=1.8:
data[rowSums(data[-1:-2]>=1.8)>0,]
# Name Group Heath BP PM
#1 QW DE23 20.0 60 10.0
#2 We Fw34 0.5 42 2.5
#4 Op Ss14 43.0 45 96.0
data[-1:-2] select the numeric columns.

Here is a tidyverse solution:
library(tidyverse)
df <- tibble::tribble(
~Name,~Group,~Heath,~BP,~PM,
"QW", "DE23",20,60,10,
"We", "Fw34",0.5,42,2.5,
"Sd", "Kl78",0.4,0.1,0.5,
"Op", "Ss14",43,45,96
)
df %>%
filter_if(is.numeric,any_vars(.>=1.8))
#> # A tibble: 3 x 5
#> Name Group Heath BP PM
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 QW DE23 20 60 10
#> 2 We Fw34 0.5 42 2.5
#> 3 Op Ss14 43 45 96
Created on 2020-12-07 by the reprex package (v0.3.0)

The easiest way is to use the filter() function from dplyr package in combination with select to automatically detect numeric columns:
library(dplyr)
df <- data.frame(Name = c("QW", "We", "Sd", "Op"),
Group = c("DE23", "Fw34", "Kl78", "Ss14"),
Heath = c(20, 0.5, 0.4, 43),
BP = c(60, 42, 0.1, 45),
PM = c(10, 2.5, 0.5, 96))
df %>% filter(rowSums(select_if(., is.numeric)) >= 1.8)
Name Group Heath BP PM
1 QW DE23 20.0 60 10.0
2 We Fw34 0.5 42 2.5
3 Op Ss14 43.0 45 96.0

An option with Reduce from base R
df[Reduce(`|`, lapply(df[-(1:2)], `>=`, 1.8)),]
# Name Group Heath BP PM
#1 QW DE23 20.0 60 10.0
#2 We Fw34 0.5 42 2.5
#4 Op Ss14 43.0 45 96.0

Related

making a loop creating new vectors in R

I have a dataset of 70 patients. At 6 different datapoints 2 laboratory values were obtained that probably correlate with each other. Here you can see some extracted data
id w2_crp w2_alb w6_crp w6_alb w10_crp w10_alb
001 1.2 35 1.1 38 0.5 39
002 10 27 0.5 42.5 0.5 40
003 2.4 30 1.7 30 1.2 32
004 0.5 37.4 0.7 38.2 0.5 35.5
For each patient I want to plot crp values on x-axis and albumin values on y axis at corresponding timepoints.
I made these vectors for 10 first IDs:
vec1 <- pull(df, w2_crp)
vec2 <- pull(df, w2_alb)
...
crp1 <- (c(first(vec1)), (first(vec3)), (first(vec5)))
and similar vectors for albumin and plotted them normally with
plot_ly(df, x ~crp1, y ~alb1, type = "scatter", mode = "lines")
but this is obviously very tedious. Do you have any ideas how to automize creating vectors and plotting them against each other with a for loop? I tried but constantly got errors... I would be grateful for your help!
I assume that the number in the column name is the timepoint. If so, you could do this:
library(tidyverse)
#example data
dat <- read_table("id w2_crp w2_alb w6_crp w6_alb w10_crp w10_alb
001 1.2 35 1.1 38 0.5 39
002 10 27 0.5 42.5 0.5 40
003 2.4 30 1.7 30 1.2 32
004 0.5 37.4 0.7 38.2 0.5 35.5")
#make each timepoint a row and each group a column
new_dat <- dat |>
pivot_longer(-id, names_pattern="\\w(\\d+)_(\\w+)",
names_to = c("time", "type")) |>
pivot_wider(names_from = type, values_from = value)
new_dat
#> # A tibble: 12 x 4
#> id time crp alb
#> <chr> <chr> <dbl> <dbl>
#> 1 001 2 1.2 35
#> 2 001 6 1.1 38
#> 3 001 10 0.5 39
#> 4 002 2 10 27
#> 5 002 6 0.5 42.5
#> 6 002 10 0.5 40
#> 7 003 2 2.4 30
#> 8 003 6 1.7 30
#> 9 003 10 1.2 32
#> 10 004 2 0.5 37.4
#> 11 004 6 0.7 38.2
#> 12 004 10 0.5 35.5
#plot data
new_dat|>
ggplot(aes(crp, alb, color = time))+
geom_point()+
facet_wrap(~id, scales = "free")

Calculate Percentage by Group with multiple columns in R

I have several data frames with monthly data, I would like to find the percentage distribution for each product and for each month. I have problem with multiple columns with months. Currently, I can only get a percentage by group for one month.
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
Product = letters[1:4],
January = sample(1:100,12),
February = sample(1:100,12))
data_new1 <- transform(data,
perc = ave(January,
group,
FUN = prop.table))
data_new1$perc<-round(data_new1$perc, 2)
> data_new1
group Product January February perc
1 A a 12 16 0.05
2 A b 73 75 0.32
3 A c 78 11 0.34
4 A d 65 35 0.29
5 B a 86 63 0.36
6 B b 33 71 0.14
7 B c 92 49 0.38
8 B d 30 60 0.12
9 C a 91 59 0.37
10 C b 31 45 0.12
11 C c 99 7 0.40
12 C d 28 50 0.11
tidyverse
library(dplyr)
data %>%
group_by(group) %>%
mutate(across(c("January", "February"), proportions, .names = "{.col}_perc")) %>%
ungroup()
# A tibble: 12 x 6
group Product January February January_perc February_perc
<chr> <chr> <int> <int> <dbl> <dbl>
1 A a 49 40 0.426 0.252
2 A b 1 3 0.00870 0.0189
3 A c 19 50 0.165 0.314
4 A d 46 66 0.4 0.415
5 B a 61 82 0.218 0.285
6 B b 88 51 0.314 0.177
7 B c 32 75 0.114 0.260
8 B d 99 80 0.354 0.278
9 C a 6 31 0.0397 0.373
10 C b 8 5 0.0530 0.0602
11 C c 92 20 0.609 0.241
12 C d 45 27 0.298 0.325
base
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
Product = letters[1:4],
January = sample(1:100,12),
February = sample(1:100,12))
tmp <- sapply(c("January", "February"), function (x) ave(data[[x]], data$group, FUN = prop.table))
colnames(tmp) <- paste0(colnames(tmp), "_perc")
res <- cbind(data, tmp)
res
#> group Product January February January_perc February_perc
#> 1 A a 42 73 0.18260870 0.238562092
#> 2 A b 67 92 0.29130435 0.300653595
#> 3 A c 58 90 0.25217391 0.294117647
#> 4 A d 63 51 0.27391304 0.166666667
#> 5 B a 48 15 0.21621622 0.081521739
#> 6 B b 16 82 0.07207207 0.445652174
#> 7 B c 80 75 0.36036036 0.407608696
#> 8 B d 78 12 0.35135135 0.065217391
#> 9 C a 81 16 0.32793522 0.117647059
#> 10 C b 83 81 0.33603239 0.595588235
#> 11 C c 11 1 0.04453441 0.007352941
#> 12 C d 72 38 0.29149798 0.279411765
Created on 2021-12-20 by the reprex package (v2.0.1)
data.table
library(data.table)
COLS <- c("January", "February")
COLS_RES <- paste0(COLS, "_perc")
setDT(data)[, (COLS_RES) := lapply(.SD, proportions), by = group, .SDcol = COLS][]
These calculations are easier if your data is structured in a tidy way. In your case, January and February should probably be one single variable called month or something.
Example:
Underneath, I use tidyr::pivot_longer() to combine January and February into one column. Then I use the package dplyr to group the dataframe and calculate perc. I'm not using prop.table(), but I believe you just want the proportion of observation to the total of that group and month.
library(dplyr)
library(tidyr)
# To make the sampling underneath reproducable
set.seed(1)
data <- data.frame(
group = rep(LETTERS[1:3], each = 4),
Product = letters[1:4],
January = sample(1:100,12),
February = sample(1:100,12)
)
data %>%
pivot_longer(c(January, February), names_to = "month", values_to = "x") %>%
group_by(group, month) %>%
mutate(
perc = round(x/sum(x), 2)
)
I hope this is what you were looking for.
Another dplyr solution:
library(dplyr)
data %>%
group_by(group) %>%
mutate(across(c(2:5),
~./sum(.)*100, .names = "{.col}_pct"))
# A tibble: 12 × 10
# Groups: group [3]
group Product Jan Feb Mar May Jan_pct Feb_pct Mar_pct May_pct
<chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 A a 14 14 95 50 8 18.4 44.4 20.9
2 A b 100 33 28 32 57.1 43.4 13.1 13.4
3 A c 11 16 13 95 6.29 21.1 6.07 39.7
4 A d 50 13 78 62 28.6 17.1 36.4 25.9
5 B a 29 42 72 13 22.0 33.9 20.3 7.07
6 B b 3 4 88 41 2.27 3.23 24.9 22.3
7 B c 30 68 94 86 22.7 54.8 26.6 46.7
8 B d 70 10 100 44 53.0 8.06 28.2 23.9
9 C a 4 88 45 84 3.96 43.6 24.2 30.7
10 C b 52 12 26 55 51.5 5.94 14.0 20.1
11 C c 26 20 23 57 25.7 9.90 12.4 20.8
12 C d 19 82 92 78 18.8 40.6 49.5 28.5
Data:
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
Product = letters[1:4],
Jan = sample(1:100,12),
Feb = sample(1:100,12),
Mar = sample(1:100, 12),
May = sample(1:100, 12))

Looping linear regression output in a data frame in r

I have a dataset below in which I want to do linear regression for each country and state and then cbind the predicted values in the dataset:
Final data frame after adding three more columns:
I have done it for one country and one area but want to do it for each country and area and put the predicted, upper and lower limit values back in the data set by cbind:
data <- data.frame(country = c("US","US","US","US","US","US","US","US","US","US","UK","UK","UK","UK","UK"),
Area = c("G","G","G","G","G","I","I","I","I","I","A","A","A","A","A"),
week = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5),amount = c(12,23,34,32,12,12,34,45,65,45,45,34,23,43,43))
data_1 <- data[(data$country=="US" & data$Area=="G"),]
model <- lm(amount ~ week, data = data_1)
pre <- predict(model,newdata = data_1,interval = "prediction",level = 0.95)
pre
How can I loop this for other combination of country and Area?
...and a Base R solution:
data <- data.frame(country = c("US","US","US","US","US","US","US","US","US","US","UK","UK","UK","UK","UK"),
Area = c("G","G","G","G","G","I","I","I","I","I","A","A","A","A","A"),
week = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5),amount = c(12,23,34,32,12,12,34,45,65,45,45,34,23,43,43))
splitVar <- paste0(data$country,"-",data$Area)
dfList <- split(data,splitVar)
result <- do.call(rbind,lapply(dfList,function(x){
model <- lm(amount ~ week, data = x)
cbind(x,predict(model,newdata = x,interval = "prediction",level = 0.95))
}))
result
...the results:
country Area week amount fit lwr upr
UK-A.11 UK A 1 45 36.6 -6.0463638 79.24636
UK-A.12 UK A 2 34 37.1 -1.3409128 75.54091
UK-A.13 UK A 3 23 37.6 0.6671656 74.53283
UK-A.14 UK A 4 43 38.1 -0.3409128 76.54091
UK-A.15 UK A 5 43 38.6 -4.0463638 81.24636
US-G.1 US G 1 12 20.8 -27.6791493 69.27915
US-G.2 US G 2 23 21.7 -21.9985147 65.39851
US-G.3 US G 3 34 22.6 -19.3841749 64.58417
US-G.4 US G 4 32 23.5 -20.1985147 67.19851
US-G.5 US G 5 12 24.4 -24.0791493 72.87915
US-I.6 US I 1 12 20.8 -33.8985900 75.49859
US-I.7 US I 2 34 30.5 -18.8046427 79.80464
US-I.8 US I 3 45 40.2 -7.1703685 87.57037
US-I.9 US I 4 65 49.9 0.5953573 99.20464
US-I.10 US I 5 45 59.6 4.9014100 114.29859
We can also use function augment from package broom to get your desired information:
library(purrr)
library(broom)
data %>%
group_by(country, Area) %>%
nest() %>%
mutate(models = map(data, ~ lm(amount ~ week, data = .)),
aug = map(models, ~ augment(.x, interval = "prediction"))) %>%
unnest(aug) %>%
select(country, Area, amount, week, .fitted, .lower, .upper)
# A tibble: 15 x 7
# Groups: country, Area [3]
country Area amount week .fitted .lower .upper
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 US G 12 1 20.8 -27.7 69.3
2 US G 23 2 21.7 -22.0 65.4
3 US G 34 3 22.6 -19.4 64.6
4 US G 32 4 23.5 -20.2 67.2
5 US G 12 5 24.4 -24.1 72.9
6 US I 12 1 20.8 -33.9 75.5
7 US I 34 2 30.5 -18.8 79.8
8 US I 45 3 40.2 -7.17 87.6
9 US I 65 4 49.9 0.595 99.2
10 US I 45 5 59.6 4.90 114.
11 UK A 45 1 36.6 -6.05 79.2
12 UK A 34 2 37.1 -1.34 75.5
13 UK A 23 3 37.6 0.667 74.5
14 UK A 43 4 38.1 -0.341 76.5
15 UK A 43 5 38.6 -4.05 81.2
Here is a tidyverse way to do this for every combination of country and Area.
library(tidyverse)
data %>%
group_by(country, Area) %>%
nest() %>%
mutate(model = map(data, ~ lm(amount ~ week, data = .x)),
result = map2(model, data, ~data.frame(predict(.x, newdata = .y,
interval = "prediction",level = 0.95)))) %>%
ungroup %>%
select(-model) %>%
unnest(c(data, result))
# country Area week amount fit lwr upr
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 US G 1 12 20.8 -27.7 69.3
# 2 US G 2 23 21.7 -22.0 65.4
# 3 US G 3 34 22.6 -19.4 64.6
# 4 US G 4 32 23.5 -20.2 67.2
# 5 US G 5 12 24.4 -24.1 72.9
# 6 US I 1 12 20.8 -33.9 75.5
# 7 US I 2 34 30.5 -18.8 79.8
# 8 US I 3 45 40.2 -7.17 87.6
# 9 US I 4 65 49.9 0.595 99.2
#10 US I 5 45 59.6 4.90 114.
#11 UK A 1 45 36.6 -6.05 79.2
#12 UK A 2 34 37.1 -1.34 75.5
#13 UK A 3 23 37.6 0.667 74.5
#14 UK A 4 43 38.1 -0.341 76.5
#15 UK A 5 43 38.6 -4.05 81.2
And one more:
library(tidyverse)
data %>%
mutate(CountryArea=paste0(country,Area) %>% factor %>% fct_inorder) %>%
split(.$CountryArea) %>%
map(~lm(amount~week, data=.)) %>%
map(predict, interval = "prediction",level = 0.95) %>%
reduce(rbind) %>%
cbind(data, .)
country Area week amount fit lwr upr
1 US G 1 12 20.8 -27.6791493 69.27915
2 US G 2 23 21.7 -21.9985147 65.39851
3 US G 3 34 22.6 -19.3841749 64.58417
4 US G 4 32 23.5 -20.1985147 67.19851
5 US G 5 12 24.4 -24.0791493 72.87915
6 US I 1 12 20.8 -33.8985900 75.49859
7 US I 2 34 30.5 -18.8046427 79.80464
8 US I 3 45 40.2 -7.1703685 87.57037
9 US I 4 65 49.9 0.5953573 99.20464
10 US I 5 45 59.6 4.9014100 114.29859
11 UK A 1 45 36.6 -6.0463638 79.24636
12 UK A 2 34 37.1 -1.3409128 75.54091
13 UK A 3 23 37.6 0.6671656 74.53283
14 UK A 4 43 38.1 -0.3409128 76.54091
15 UK A 5 43 38.6 -4.0463638 81.24636

Convert to long table generating subvariables within a variable with tidyr

Convert to long by generating sub-variables within a variable "Variable" colulum I group 1:2 = WW, 34 = MM, and 158:190=EE.
df <- data.frame(A=c("A", "B", "C"), `1`=c("1.9", "6.8", "4.7"), `2`=c("1.9", "6.8", "4.7"), `34`=c("3.9", "0.3", "2.7"), `158`=c("2.9", "3", "45"),`190`=c("22.1", "7.4", "56"), check.names=FALSE)
from my df:
1 2 34 158 190
A 1.9 1.9 3.9 2.9 22.1
B 6.8 6.3 0.3 3 7.4
C 4.7 4.7 2.7 45 56
Desired output
Letter Number Variable Value
A 1 WW 1.9
A 2 WW 1.9
A 34 MM 3.9
A 158 EE 2.9
A 190 EE 22.1
B 1 WW 6.8
B 2 WW 6.8
B 34 MM 0.3
B 158 EE 3
B 190 EE 7.4
...
I tried this but I need to add the new cathegory including MM, MM and EE.
library(tidyr)
data_long <- gather(df, Letter, value, 1:90, factor_key=TRUE)
Does this work:
library(dplyr)
library(tidyr)
df %>% pivot_longer(cols = !A, names_to = 'Number', values_to = 'Value') %>% type.convert(as.is = T) %>%
mutate(Variable = case_when(Number %in% c(1,2) ~ 'WW', Number == 34 ~ 'MM', TRUE ~ 'EE')) %>%
select('Letter' = A, Number, Variable, Value)
Output:
# A tibble: 15 x 4
Letter Number Variable Value
<chr> <int> <chr> <dbl>
1 A 1 WW 1.9
2 A 2 WW 1.9
3 A 34 MM 3.9
4 A 158 EE 2.9
5 A 190 EE 22.1
6 B 1 WW 6.8
7 B 2 WW 6.8
8 B 34 MM 0.3
9 B 158 EE 3
10 B 190 EE 7.4
11 C 1 WW 4.7
12 C 2 WW 4.7
13 C 34 MM 2.7
14 C 158 EE 45
15 C 190 EE 56
>

Display duplicate records in data.frame and omit single ones

I have been struggling with how to select ONLY duplicated rows of data.frame in R.
For Instance, my data.frame is:
age=18:29
height=c(76.1,77,78.1,78.2,78.8,79.7,79.9,81.1,81.2,81.8,82.8,83.5)
Names=c("John","John","John", "Harry", "Paul", "Paul", "Paul", "Khan", "Khan", "Khan", "Sam", "Joe")
village <- data.frame(Names, age, height)
Names age height
John 18 76.1
John 19 77.0
John 20 78.1
Harry 21 78.2
Paul 22 78.8
Paul 23 79.7
Paul 24 79.9
Khan 25 81.1
Khan 26 81.2
Khan 27 81.8
Sam 28 82.8
Joe 29 83.5
I want to see the result as following:
Names age height
John 18 76.1
John 19 77.0
John 20 78.1
Paul 22 78.8
Paul 23 79.7
Paul 24 79.9
Khan 25 81.1
Khan 26 81.2
Khan 27 81.8
Thanks for your time...
A solution using duplicated twice:
village[duplicated(village$Names) | duplicated(village$Names, fromLast = TRUE), ]
Names age height
1 John 18 76.1
2 John 19 77.0
3 John 20 78.1
5 Paul 22 78.8
6 Paul 23 79.7
7 Paul 24 79.9
8 Khan 25 81.1
9 Khan 26 81.2
10 Khan 27 81.8
An alternative solution with by:
village[unlist(by(seq(nrow(village)), village$Names,
function(x) if(length(x)-1) x)), ]
I find #Sven's answer using duplicated the "tidiest", but you can also do this many other ways. Here are two more:
Use table() and subset by matching the names where the tabulation is > 1 with the names present in the first column:
village[village$Names %in% names(which(table(village$Names) > 1)), ]
Use ave() to "tabulate" in a little different manner, but subset in the same way:
village[with(village, ave(as.numeric(Names), Names, FUN = length) > 1), ]
Alternatively, you can use grouping and summary in a dplyr pipeline.
It's more lines of code and maybe more costly in compute. But, the advantage is that you can find duplicate rows by a composite key of multiple columns, rather than only duplicates from within one column.
library(tidyverse)
a <- c(8, 18, 19, 19, 19, 20, 30, 32, 32)
b <- c(1950, 1965, 1981, 1971, 1981, 1999, 1969, 1994, 1999)
c <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)
df <- data.frame(a, b, c)
df
# Description:df[,3] [9 × 3]
# a
# <dbl>
# b
# <dbl>
# c
# <dbl>
# 8 1950 1
# 18 1965 2
# 19 1981 3
# 19 1971 4
# 19 1981 5
# 20 1999 6
# 30 1969 7
# 32 1994 8
# 32 1999 9
# 9 rows
df[duplicated(df$a) | duplicated(df$a, fromLast = T), ]
# Description:df[,3] [5 × 3]
#
#
# a
# <dbl>
# b
# <dbl>
# c
# <dbl>
# 3 19 1981 3
# 4 19 1971 4
# 5 19 1981 5
# 8 32 1994 8
# 9 32 1999 9
# 5 rows
df[duplicated(df$a, df$b) | duplicated(df$a, df$b, fromLast = T), ]
# Description:df[,3] [5 × 3]
#
#
# a
# <dbl>
# b
# <dbl>
# c
# <dbl>
# 3 19 1981 3
# 4 19 1971 4
# 5 19 1981 5
# 8 32 1994 8
# 9 32 1999 9
# 5 rows
df %>%
group_by(a, b) %>%
summarise(a = a, b = b, c = c, n = n()) %>%
subset(n > 1) %>%
select(a, b, c)
#
# A tibble:2 x 3
# Groups:a, b [1]
# a
# <dbl>
# b
# <dbl>
# c
# <dbl>
# 19 1981 3
# 19 1981 5
# 2 rows
df[duplicated(df, incomparables = c(c)), ]
# Error: argument 'incomparables != FALSE' is not used (yet)
# This error occurs even with no libraries loaded.
I may be missing something in the way duplicated() is used in brackets, but I couldn't figure it out.
Also, dplyr returns a tibble, dropping the index, which may be a drawback for you.
I came up with a solution using nested sapply:
> village_dups =
village[unique(unlist(which(sapply(sapply(village$Names,function(x)
which(village$Names==x)),function(y) length(y)) > 1))),]
> village_dups
Names age height
1 John 18 76.1
2 John 19 77.0
3 John 20 78.1
5 Paul 22 78.8
6 Paul 23 79.7
7 Paul 24 79.9
8 Khan 25 81.1
9 Khan 26 81.2
10 Khan 27 81.8

Resources