Convert to long table generating subvariables within a variable with tidyr - r

Convert to long format by generating sub-variables within a "Variable" column. I group columns 1:2 = WW, 34 = MM, and 158:190 = EE.
df <- data.frame(A=c("A", "B", "C"), `1`=c("1.9", "6.8", "4.7"), `2`=c("1.9", "6.8", "4.7"), `34`=c("3.9", "0.3", "2.7"), `158`=c("2.9", "3", "45"),`190`=c("22.1", "7.4", "56"), check.names=FALSE)
from my df:
1 2 34 158 190
A 1.9 1.9 3.9 2.9 22.1
B 6.8 6.8 0.3 3 7.4
C 4.7 4.7 2.7 45 56
Desired output
Letter Number Variable Value
A 1 WW 1.9
A 2 WW 1.9
A 34 MM 3.9
A 158 EE 2.9
A 190 EE 22.1
B 1 WW 6.8
B 2 WW 6.8
B 34 MM 0.3
B 158 EE 3
B 190 EE 7.4
...
I tried this but I need to add the new category column containing WW, MM and EE.
library(tidyr)
data_long <- gather(df, Letter, value, 1:90, factor_key=TRUE)

Does this work:
library(dplyr)
library(tidyr)
# Reshape to long: every column except A becomes a Number/Value pair;
# type.convert(as.is = T) then parses the character Number and Value
# columns into integer/double.
df %>% pivot_longer(cols = !A, names_to = 'Number', values_to = 'Value') %>% type.convert(as.is = T) %>%
# Classify each Number into its group: 1-2 -> WW, 34 -> MM, all others -> EE.
mutate(Variable = case_when(Number %in% c(1,2) ~ 'WW', Number == 34 ~ 'MM', TRUE ~ 'EE')) %>%
# Rename A to Letter and put the columns in the requested order.
select('Letter' = A, Number, Variable, Value)
Output:
# A tibble: 15 x 4
Letter Number Variable Value
<chr> <int> <chr> <dbl>
1 A 1 WW 1.9
2 A 2 WW 1.9
3 A 34 MM 3.9
4 A 158 EE 2.9
5 A 190 EE 22.1
6 B 1 WW 6.8
7 B 2 WW 6.8
8 B 34 MM 0.3
9 B 158 EE 3
10 B 190 EE 7.4
11 C 1 WW 4.7
12 C 2 WW 4.7
13 C 34 MM 2.7
14 C 158 EE 45
15 C 190 EE 56
>

Related

making a loop creating new vectors in R

I have a dataset of 70 patients. At 6 different datapoints 2 laboratory values were obtained that probably correlate with each other. Here you can see some extracted data
id w2_crp w2_alb w6_crp w6_alb w10_crp w10_alb
001 1.2 35 1.1 38 0.5 39
002 10 27 0.5 42.5 0.5 40
003 2.4 30 1.7 30 1.2 32
004 0.5 37.4 0.7 38.2 0.5 35.5
For each patient I want to plot crp values on x-axis and albumin values on y axis at corresponding timepoints.
I made these vectors for 10 first IDs:
vec1 <- pull(df, w2_crp)
vec2 <- pull(df, w2_alb)
...
crp1 <- c(first(vec1), first(vec3), first(vec5))
and similar vectors for albumin and plotted them normally with
plot_ly(df, x = ~crp1, y = ~alb1, type = "scatter", mode = "lines")
but this is obviously very tedious. Do you have any ideas how to automate creating the vectors and plotting them against each other with a for loop? I tried but constantly got errors... I would be grateful for your help!
I assume that the number in the column name is the timepoint. If so, you could do this:
library(tidyverse)
#example data
dat <- read_table("id w2_crp w2_alb w6_crp w6_alb w10_crp w10_alb
001 1.2 35 1.1 38 0.5 39
002 10 27 0.5 42.5 0.5 40
003 2.4 30 1.7 30 1.2 32
004 0.5 37.4 0.7 38.2 0.5 35.5")
#make each timepoint a row and each group a column
new_dat <- dat |>
pivot_longer(-id, names_pattern="\\w(\\d+)_(\\w+)",
names_to = c("time", "type")) |>
pivot_wider(names_from = type, values_from = value)
new_dat
#> # A tibble: 12 x 4
#> id time crp alb
#> <chr> <chr> <dbl> <dbl>
#> 1 001 2 1.2 35
#> 2 001 6 1.1 38
#> 3 001 10 0.5 39
#> 4 002 2 10 27
#> 5 002 6 0.5 42.5
#> 6 002 10 0.5 40
#> 7 003 2 2.4 30
#> 8 003 6 1.7 30
#> 9 003 10 1.2 32
#> 10 004 2 0.5 37.4
#> 11 004 6 0.7 38.2
#> 12 004 10 0.5 35.5
#plot data
new_dat|>
ggplot(aes(crp, alb, color = time))+
geom_point()+
facet_wrap(~id, scales = "free")

Calculate Percentage by Group with multiple columns in R

I have several data frames with monthly data, and I would like to find the percentage distribution for each product and for each month. I have a problem with the multiple month columns. Currently, I can only get a percentage by group for one month.
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
Product = letters[1:4],
January = sample(1:100,12),
February = sample(1:100,12))
data_new1 <- transform(data,
perc = ave(January,
group,
FUN = prop.table))
data_new1$perc<-round(data_new1$perc, 2)
> data_new1
group Product January February perc
1 A a 12 16 0.05
2 A b 73 75 0.32
3 A c 78 11 0.34
4 A d 65 35 0.29
5 B a 86 63 0.36
6 B b 33 71 0.14
7 B c 92 49 0.38
8 B d 30 60 0.12
9 C a 91 59 0.37
10 C b 31 45 0.12
11 C c 99 7 0.40
12 C d 28 50 0.11
tidyverse
library(dplyr)
data %>%
group_by(group) %>%
mutate(across(c("January", "February"), proportions, .names = "{.col}_perc")) %>%
ungroup()
# A tibble: 12 x 6
group Product January February January_perc February_perc
<chr> <chr> <int> <int> <dbl> <dbl>
1 A a 49 40 0.426 0.252
2 A b 1 3 0.00870 0.0189
3 A c 19 50 0.165 0.314
4 A d 46 66 0.4 0.415
5 B a 61 82 0.218 0.285
6 B b 88 51 0.314 0.177
7 B c 32 75 0.114 0.260
8 B d 99 80 0.354 0.278
9 C a 6 31 0.0397 0.373
10 C b 8 5 0.0530 0.0602
11 C c 92 20 0.609 0.241
12 C d 45 27 0.298 0.325
base
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
Product = letters[1:4],
January = sample(1:100,12),
February = sample(1:100,12))
tmp <- sapply(c("January", "February"), function (x) ave(data[[x]], data$group, FUN = prop.table))
colnames(tmp) <- paste0(colnames(tmp), "_perc")
res <- cbind(data, tmp)
res
#> group Product January February January_perc February_perc
#> 1 A a 42 73 0.18260870 0.238562092
#> 2 A b 67 92 0.29130435 0.300653595
#> 3 A c 58 90 0.25217391 0.294117647
#> 4 A d 63 51 0.27391304 0.166666667
#> 5 B a 48 15 0.21621622 0.081521739
#> 6 B b 16 82 0.07207207 0.445652174
#> 7 B c 80 75 0.36036036 0.407608696
#> 8 B d 78 12 0.35135135 0.065217391
#> 9 C a 81 16 0.32793522 0.117647059
#> 10 C b 83 81 0.33603239 0.595588235
#> 11 C c 11 1 0.04453441 0.007352941
#> 12 C d 72 38 0.29149798 0.279411765
Created on 2021-12-20 by the reprex package (v2.0.1)
data.table
library(data.table)
# Columns to convert to within-group proportions, and the names of the
# new result columns derived from them.
COLS <- c("January", "February")
COLS_RES <- paste0(COLS, "_perc")
# Add the proportion columns by reference, one per month, within each group.
# Spell out `.SDcols` in full: the original `.SDcol` only worked through
# R's partial argument matching. The trailing `[]` forces the modified
# data.table to print.
setDT(data)[, (COLS_RES) := lapply(.SD, proportions), by = group, .SDcols = COLS][]
These calculations are easier if your data is structured in a tidy way. In your case, January and February should probably be one single variable called month or something.
Example:
Underneath, I use tidyr::pivot_longer() to combine January and February into one column. Then I use the package dplyr to group the dataframe and calculate perc. I'm not using prop.table(), but I believe you just want the proportion of observation to the total of that group and month.
library(dplyr)
library(tidyr)
# To make the sampling underneath reproducable
set.seed(1)
data <- data.frame(
group = rep(LETTERS[1:3], each = 4),
Product = letters[1:4],
January = sample(1:100,12),
February = sample(1:100,12)
)
data %>%
pivot_longer(c(January, February), names_to = "month", values_to = "x") %>%
group_by(group, month) %>%
mutate(
perc = round(x/sum(x), 2)
)
I hope this is what you were looking for.
Another dplyr solution:
library(dplyr)
data %>%
group_by(group) %>%
mutate(across(c(2:5),
~./sum(.)*100, .names = "{.col}_pct"))
# A tibble: 12 × 10
# Groups: group [3]
group Product Jan Feb Mar May Jan_pct Feb_pct Mar_pct May_pct
<chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 A a 14 14 95 50 8 18.4 44.4 20.9
2 A b 100 33 28 32 57.1 43.4 13.1 13.4
3 A c 11 16 13 95 6.29 21.1 6.07 39.7
4 A d 50 13 78 62 28.6 17.1 36.4 25.9
5 B a 29 42 72 13 22.0 33.9 20.3 7.07
6 B b 3 4 88 41 2.27 3.23 24.9 22.3
7 B c 30 68 94 86 22.7 54.8 26.6 46.7
8 B d 70 10 100 44 53.0 8.06 28.2 23.9
9 C a 4 88 45 84 3.96 43.6 24.2 30.7
10 C b 52 12 26 55 51.5 5.94 14.0 20.1
11 C c 26 20 23 57 25.7 9.90 12.4 20.8
12 C d 19 82 92 78 18.8 40.6 49.5 28.5
Data:
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
Product = letters[1:4],
Jan = sample(1:100,12),
Feb = sample(1:100,12),
Mar = sample(1:100, 12),
May = sample(1:100, 12))

Remove rows based on condition R

I have a data as like this
Name Group Heath BP PM
QW DE23 20 60 10
We Fw34 0.5 42 2.5
Sd Kl78 0.4 0.1 0.5
Op Ss14 43 45 96
I need to remove every row in which all of the numeric values are less than 1.8.
I used following command
data[colSums(data)>=1.8]
data[,colSums(data)>=1.8, drop=FALSE]
subset(data, select=colSums(data) >=1.8)
But I got error as like this "Error in colSums(data) : 'x' must be numeric"
Expected output
Name Group Heath BP PM
QW DE23 20 60 10
We Fw34 0.5 42 2.5
Op Ss14 43 45 96
You can use to select rows where their sum is >=1.8:
data[rowSums(data[-1:-2])>=1.8,]
# Name Group Heath BP PM
#1 QW DE23 20.0 60 10.0
#2 We Fw34 0.5 42 2.5
#4 Op Ss14 43.0 45 96.0
or where any element in the row is >=1.8:
data[rowSums(data[-1:-2]>=1.8)>0,]
# Name Group Heath BP PM
#1 QW DE23 20.0 60 10.0
#2 We Fw34 0.5 42 2.5
#4 Op Ss14 43.0 45 96.0
data[-1:-2] select the numeric columns.
Here is a tidyverse solution:
library(tidyverse)
df <- tibble::tribble(
~Name,~Group,~Heath,~BP,~PM,
"QW", "DE23",20,60,10,
"We", "Fw34",0.5,42,2.5,
"Sd", "Kl78",0.4,0.1,0.5,
"Op", "Ss14",43,45,96
)
df %>%
filter_if(is.numeric,any_vars(.>=1.8))
#> # A tibble: 3 x 5
#> Name Group Heath BP PM
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 QW DE23 20 60 10
#> 2 We Fw34 0.5 42 2.5
#> 3 Op Ss14 43 45 96
Created on 2020-12-07 by the reprex package (v0.3.0)
The easiest way is to use the filter() function from dplyr package in combination with select to automatically detect numeric columns:
library(dplyr)
df <- data.frame(Name = c("QW", "We", "Sd", "Op"),
Group = c("DE23", "Fw34", "Kl78", "Ss14"),
Heath = c(20, 0.5, 0.4, 43),
BP = c(60, 42, 0.1, 45),
PM = c(10, 2.5, 0.5, 96))
df %>% filter(rowSums(select_if(., is.numeric)) >= 1.8)
Name Group Heath BP PM
1 QW DE23 20.0 60 10.0
2 We Fw34 0.5 42 2.5
3 Op Ss14 43.0 45 96.0
An option with Reduce from base R
df[Reduce(`|`, lapply(df[-(1:2)], `>=`, 1.8)),]
# Name Group Heath BP PM
#1 QW DE23 20.0 60 10.0
#2 We Fw34 0.5 42 2.5
#4 Op Ss14 43.0 45 96.0

How to calculate House distance with euclidean distance between two set of points (coordinates) with R

I am trying to calculate euclidean distance between House a and x, b and x, ... from a table. This is my data look like:
df <- data.frame(house=c(letters[1:10],"x"),long=c(11,15,19,18,16,23,25,21,23,29,19),
lat=c(26,29,28,30,26,25,22,24,25,24,25),
location=(c(rep("city", 5),rep("district", 5), "null")))
I have tried to calculate with euclid formula:
# Euclidean distance between points (x1, y1) and (x2, y2).
#
# All arguments are numeric vectors and are combined elementwise (with
# the usual recycling rules), so the function is vectorized over whole
# coordinate columns.
#
# @param x1,x2 Numeric vectors of x-coordinates.
# @param y1,y2 Numeric vectors of y-coordinates.
# @return Numeric vector of pairwise distances.
euclid <- function(x1, x2, y1, y2) {
  sqrt((x1 - x2)^2 + (y1 - y2)^2)
}
I am looking for this output:
House long lat **Distance to X**
h 21 24 2.24
c 19 28 3
e 16 26 3.16
f 23 25 4
i 23 25 4
d 18 30 5.1
b 15 29 5.66
g 25 22 6.71
a 11 26 8.06
j 29 24 10.05
How would I loop the formula to the long and lat values?
There's also the dist() function. Note the rownames step is there to make the output more readable:
rownames(df) <- df[['house']]
dist(df[, c('long', 'lat')])
# added round(..., 1) to make this output
a b c d e f g h i j
b 5.0
c 8.2 4.1
d 8.1 3.2 2.2
e 5.0 3.2 3.6 4.5
f 12.0 8.9 5.0 7.1 7.1
g 14.6 12.2 8.5 10.6 9.8 3.6
h 10.2 7.8 4.5 6.7 5.4 2.2 4.5
i 12.0 8.9 5.0 7.1 7.1 0.0 3.6 2.2
j 18.1 14.9 10.8 12.5 13.2 6.1 4.5 8.0 6.1
x 8.1 5.7 3.0 5.1 3.2 4.0 6.7 2.2 4.0 10.0
To get your intended output, you can convert the dist class to a matrix and subset:
as.matrix(dist(df[, c('long', 'lat')]))[11, -11]
a b c d e f g h i j
8.1 5.7 3.0 5.1 3.2 4.0 6.7 2.2 4.0 10.0
df$distance_to_x <- as.matrix(dist(df[, c('long', 'lat')]))[11, ]
df
house long lat location distance_to_x
a a 11 26 city 8.062258
b b 15 29 city 5.656854
c c 19 28 city 3.000000
d d 18 30 city 5.099020
e e 16 26 city 3.162278
f f 23 25 district 4.000000
g g 25 22 district 6.708204
h h 21 24 district 2.236068
i i 23 25 district 4.000000
j j 29 24 district 10.049876
x x 19 25 null 0.000000
And if you wanted to use your function as #nicola suggested. Using with() can be helpful as well:
with(df, euclid(long, long[house =='x'], lat, lat[house == 'x']))
Besides the approach with dist() by #Cole, you can use outer() to make it as well, i.e.,
# form complex-valued coordinates
z <- with(df,long + 1i*lat)
# calculate distance between complex numbers
df$distance2x <- as.numeric(abs(outer(z,z,"-"))[which(df$house == "x"),])
such that
> df
house long lat location distance2x
1 a 11 26 city 8.062258
2 b 15 29 city 5.656854
3 c 19 28 city 3.000000
4 d 18 30 city 5.099020
5 e 16 26 city 3.162278
6 f 23 25 district 4.000000
7 g 25 22 district 6.708204
8 h 21 24 district 2.236068
9 i 23 25 district 4.000000
10 j 29 24 district 10.049876
11 x 19 25 null 0.000000
Note: the idea is to form complex-valued coordinates and use abs() over the difference between two houses

Calculate row mean with condition in dplyr

My dataset looks like this:
> head(tempExp)
points.id wc2.0_30s_tavg_01 wc2.0_30s_tavg_02
1 AmsterdamGreenhouses_Calamagrostis eigejos-AM_Nhigh 3.1 3.2
2 AmsterdamGreenhouses_Molinia caerulea-AM_Nhigh 3.1 3.2
3 Bangor_Alnus-ECM/AM_Nlow 3.8 3.6
4 Bangor_Betula_pendula-ECM_Nlow 3.8 3.6
5 Bangor_Fagus-ECM_Nlow 3.8 3.6
6 BioCON_nolegumes_mixed-AM_Nlow -11.8 -7.9
wc2.0_30s_tavg_03 wc2.0_30s_tavg_04 wc2.0_30s_tavg_05 wc2.0_30s_tavg_06 wc2.0_30s_tavg_07
1 5.9 8.3 12.6 15.1 17.1
2 5.9 8.3 12.6 15.1 17.1
3 5.4 7.3 10.3 12.7 14.7
4 5.4 7.3 10.3 12.7 14.7
5 5.4 7.3 10.3 12.7 14.7
6 -1.2 7.2 14.5 19.3 21.8
For each row (id) I need to calculate the mean across the entire row, but only including those columns with value > 5.
require(dplyr)
# simulate a similar data set
set.seed(1984)
x <- rep('',100)
for (i in 1:100)
{x[i] <- paste(sample(c(LETTERS, 0:9), 5, replace = T), collapse = '')}
df <- data.frame(ID = x, v1 = 3*rnorm(100),
v2 = 5+3*rnorm(100),
v3 = sample(1:20, 100, replace = T),
v4 = rpois(100,6),
v5 = rep(15,100))
head(df)
# ID v1 v2 v3 v4 v5
#1 XPNL0 7.839162 -1.341105 12 5 15
#2 5BQ3H -1.241025 7.651719 1 5 15
#3 5AZZH 2.185374 2.186604 6 4 15
#4 AKX7H 3.148868 2.513623 13 5 15
#5 VAW42 2.757498 3.888333 16 5 15
#6 F4UST -1.894727 4.587320 2 2 15
df %>%
# Row-wise mean over the numeric columns (df[,-1] drops the ID column),
# keeping only the values strictly greater than 5 in each row.
# apply() coerces the data.frame to a matrix, which is safe here because
# every remaining column is numeric.
mutate(avg =apply(df[,-1], 1,
function(x) mean(x[x >5]))) -> df
head(df)
# ID v1 v2 v3 v4 v5 avg
#1 XPNL0 7.839162 -1.341105 12 5 15 11.61305
#2 5BQ3H -1.241025 7.651719 1 5 15 11.32586
#3 5AZZH 2.185374 2.186604 6 4 15 10.50000
#4 AKX7H 3.148868 2.513623 13 5 15 14.00000
#5 VAW42 2.757498 3.888333 16 5 15 15.50000
#6 F4UST -1.894727 4.587320 2 2 15 15.00000

Resources