In this image, I want to rearrange my table (on the left-hand side) into a table (on the right-hand side) containing 3 columns.
https://drive.google.com/file/d/0B4GgTf6nYI4YMHltWjRkeDhob3M/view?usp=sharing
That is, I have a table like this
0 3 6 9 13 16 31 64
N 100,0 98,7 96,7 97,5 91,2 15,7 0,4 0,6
N1 100,0 102,0 97,8 98,6 89,8 11,0 0,3 0,2
and want to arrange it like this:
Alkanes Time Degradation
N 0 100,0
N 3 98,7
N 6 96,7
N 9 97,5
N 13 91,2
N 16 15,7
N 31 0,4
N 64 0,6
N1 0 100,0
N1 3 102,0
N1 6 97,8
N1 9 98,6
N1 13 89,8
N1 16 11,0
N1 31 0,3
N1 64 0,2
Sample data:
x <- structure(list(X = structure(1:3, .Label = c("N", "N1", "N2"), class = "factor"), X0 = c(100, 100, 100), X3 = c(98.7, 102, 95.1), X6 = c(96.7, 97.8, 94.5), X9 = c(97.5, 98.6, 101), X13 = c(91.2, 89.8, 89.4), X16 = c(15.7, 11, 22.5), X31 = c(0.4, 0.3, 0), X64 = c(0.6, 0.2, 0)), .Names = c("X", "X0", "X3", "X6", "X9", "X13", "X16", "X31", "X64"), class = "data.frame", row.names = c(NA, -3L))
Desired output:
y <- structure(list(Alkanes = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("N", "N1", "N2"), class = "factor"), Time = c(0L, 3L, 6L, 9L, 13L, 16L, 31L, 64L, 0L, 3L, 6L, 9L, 13L, 16L, 31L, 64L, 0L, 3L, 6L, 9L, 13L, 16L, 31L, 64L), Degradation = c(100, 98.7, 96.7, 97.5, 91.2, 15.7, 0.4, 0.6, 100, 102, 97.8, 98.6, 89.8, 11, 0.3, 0.2, 100, 95.1, 94.5, 101, 89.4, 22.5, 0, 0)), .Names = c("Alkanes", "Time", "Degradation"), class = "data.frame", row.names = c(NA, -24L))
Given "x" as:
x
# X X0 X3 X6 X9 X13 X16 X31 X64
# 1 N 100 98.7 96.7 97.5 91.2 15.7 0.4 0.6
# 2 N1 100 102.0 97.8 98.6 89.8 11.0 0.3 0.2
# 3 N2 100 95.1 94.5 101.0 89.4 22.5 0.0 0.0
You can try something like:
# Long-format reshape in base R: build a labelled matrix from `x`, then let
# as.data.frame() on a table emit one row per (row label, column label) cell.
local({
  # Every column except the first holds the measurements.
  wide <- as.matrix(x[-1])
  # Row labels come from the first column; column labels are the remaining
  # column names with every "X" removed.
  dimnames(wide) <- list(x[[1]], gsub("X", "", names(x)[-1]))
  as.data.frame(as.table(wide))
})
# Var1 Var2 Freq
# 1 N 0 100.0
# 2 N1 0 100.0
# 3 N2 0 100.0
# 4 N 3 98.7
# 5 N1 3 102.0
# 6 N2 3 95.1
# 7 N 6 96.7
# 8 N1 6 97.8
# 9 N2 6 94.5
# 10 N 9 97.5
# 11 N1 9 98.6
# 12 N2 9 101.0
# 13 N 13 91.2
# 14 N1 13 89.8
# 15 N2 13 89.4
# 16 N 16 15.7
# 17 N1 16 11.0
# 18 N2 16 22.5
# 19 N 31 0.4
# 20 N1 31 0.3
# 21 N2 31 0.0
# 22 N 64 0.6
# 23 N1 64 0.2
# 24 N2 64 0.0
From there, it's just sorting and renaming your columns, which are fairly standard operations.
You can try
library(reshape2)
# Remove the first run of non-digit characters from every column name.
names(my_data) <- sub('[^0-9]+', '', names(my_data))
# Matrix of all columns but the first, using the first column as row names.
m1 <- `rownames<-`(as.matrix(my_data[-1]), my_data[, 1])
# Long form of the matrix; Var1 is used below as the sort key.
d1 <- melt(m1)
# Order the rows by Var1, then give the three columns their final names.
d2 <- d1[order(d1$Var1), ]
names(d2) <- c('Alkanes', 'Time', 'Degradation')
Or
# Measurement columns only (first column dropped).
my_data1 <- my_data[-1]
# One output row per cell of my_data1: row() and col() give each cell's row and
# column index, which are used to look up the matching first-column label and
# column name; unlist() flattens the values column by column.
dN <- data.frame(Alkanes= my_data[1][row(my_data1)],
Time= names(my_data1)[col(my_data1)], Degradation=unlist(my_data1))
# Sort by the first column (Alkanes) ...
dN1 <- dN[order(dN[,1]),]
# ... and reset the row names to the default sequence.
row.names(dN1) <- NULL
Related
I want to find the number of SNPs that have FDR-adjusted p-values below 0.05. However, my for loop and if statement did not correctly find the number of SNPs with p < .05.
My dataset has a P column which indicates p-value and 1422 observations.
> dput(dat[1:5,])
structure(list(CHR = c(6L, 6L, 6L, 6L, 6L), SNP = c("rs9257319",
"rs2269553", "rs2894066", "rs3763338", "rs1233508"), BP = c(28959616L,
28984488L, 29001906L, 29002290L, 29005612L), A1 = c(2L, 1L, 1L,
1L, 2L), A2 = c(1L, 2L, 2L, 2L, 1L), T = c(6L, 9L, 13L, 4L, 8L
), U = c(7L, 9L, 9L, 3L, 13L), OR = c(0.8571, 1, 1.444, 1.333,
0.6154), L95 = c(0.2881, 0.397, 0.6174, 0.2984, 0.2551), U95 = c(2.55,
2.519, 3.379, 5.957, 1.485), CHISQ = c(0.07692, 0, 0.7273, 0.1429,
1.19), P = c(0.7815, 1, 0.3938, 0.7055, 0.2752)), row.names = c(NA,
5L), class = "data.frame")
I calculated the q-values using the qvalue library.
library(qvalue)
library(dplyr)
fdr <- qvalue(dat$P, fdr.level=0.05)
Finally, I want to find the number of SNPs with FDR adjusted p-values of p<.05.
# SNPs that have FDR adjusted p-values of p<.05
# Visits each q-value in turn; for a value above 0.05 it evaluates an indexing
# expression on `fdr`, but the result is never assigned, so `fdr` (and
# fdr$qvalues) is left unchanged by this loop.
for(i in fdr$qvalues){
if(i>0.05){
# NOTE(review): the value of this expression is discarded -- nothing is stored.
fdr[!fdr$qvalues %in% i]
}
}
I found that there is one q-value > 0.05 and tried to remove it. However, as shown below, the length of fdr$qvalues remains the same, meaning that the element with q-value > 0.05 was not removed.
length(fdr$qvalues)
[1] 1422
library(tidyverse)
# slightly modified p values to see the result
data <- structure(list(CHR = c(6L, 6L, 6L, 6L, 6L), SNP = c(
"rs9257319",
"rs2269553", "rs2894066", "rs3763338", "rs1233508"
), BP = c(
28959616L,
28984488L, 29001906L, 29002290L, 29005612L
), A1 = c(
2L, 1L, 1L,
1L, 2L
), A2 = c(1L, 2L, 2L, 2L, 1L), T = c(6L, 9L, 13L, 4L, 8L), U = c(7L, 9L, 9L, 3L, 13L), OR = c(
0.8571, 1, 1.444, 1.333,
0.6154
), L95 = c(0.2881, 0.397, 0.6174, 0.2984, 0.2551), U95 = c(
2.55,
2.519, 3.379, 5.957, 1.485
), CHISQ = c(
0.07692, 0, 0.7273, 0.1429,
1.19
), P = c(0.001, 1, 0.3, 0.01, 0.5)), row.names = c(
NA,
5L
), class = "data.frame")
data
#> CHR SNP BP A1 A2 T U OR L95 U95 CHISQ P
#> 1 6 rs9257319 28959616 2 1 6 7 0.8571 0.2881 2.550 0.07692 0.001
#> 2 6 rs2269553 28984488 1 2 9 9 1.0000 0.3970 2.519 0.00000 1.000
#> 3 6 rs2894066 29001906 1 2 13 9 1.4440 0.6174 3.379 0.72730 0.300
#> 4 6 rs3763338 29002290 1 2 4 3 1.3330 0.2984 5.957 0.14290 0.010
#> 5 6 rs1233508 29005612 2 1 8 13 0.6154 0.2551 1.485 1.19000 0.500
# Add the adjusted p-values as column `q` and keep only rows with q < 0.05.
# method = "fdr" in p.adjust() is the Benjamini-Hochberg adjustment.
data %>%
mutate(q = p.adjust(P, method = "fdr")) %>%
filter(q < 0.05)
#> CHR SNP BP A1 A2 T U OR L95 U95 CHISQ P q
#> 1 6 rs9257319 28959616 2 1 6 7 0.8571 0.2881 2.550 0.07692 0.001 0.005
#> 2 6 rs3763338 29002290 1 2 4 3 1.3330 0.2984 5.957 0.14290 0.010 0.025
# Same filter as above, followed by count() to report how many rows remain.
data %>%
mutate(q = p.adjust(P, method = "fdr")) %>%
filter(q < 0.05) %>%
count()
#> n
#> 1 2
Created on 2022-02-10 by the reprex package (v2.0.0)
Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 1 year ago.
This post was edited and submitted for review 1 year ago and failed to reopen the post:
Needs more focus Update the question so it focuses on one problem only by editing this post.
Improve this question
I have data of the size of 200 individuals of a particular plant species. But the size was measured in an indirect way, counting the number of leaves (discrete data), monthly during a total of 14 months. The germination, growth and death of the plants are very irregular, with some plants having a long life span, other dying quickly, and also the germination in time is irregular: new plants kept germinating during all the study period and being incorporated into the study. Here is an example of my data (numbers inside cells refer to the number of leaves):
Month.1
Month.2
Month.3
Month.4
Month.5
Month.6
Month.7
plant.1
3
21
15
-
-
-
-
plant.2
-
7
14
-
-
-
-
plant.3
-
8
12
10
-
-
-
plant.4
-
-
1
3
5
-
-
plant.5
-
3
6
18
13
4
-
.....
...
...
...
...
...
...
...
Following Shibaprasadb's suggestion, here is a subset of my data, already converted to long format:
df <- dput(df)
structure(list(plant = c(1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 28L, 29L, 29L, 29L, 30L, 31L, 32L, 32L,
32L, 32L, 32L, 62L, 62L, 63L, 64L, 65L, 65L, 65L, 65L, 66L, 67L,
67L, 67L), month = c(4L, 5L, 6L, 7L, 8L, 4L, 7L, 8L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 7L, 6L, 7L, 8L, 6L, 7L, 4L, 5L, 6L, 7L, 8L,
5L, 6L, 6L, 6L, 5L, 6L, 7L, 8L, 8L, 2L, 3L, 4L), time = c(0L,
1L, 2L, 3L, 4L, 0L, 0L, 1L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 0L, 0L,
1L, 2L, 0L, 0L, 0L, 1L, 2L, 3L, 4L, 0L, 1L, 0L, 0L, 0L, 1L, 2L,
3L, 0L, 0L, 1L, 2L), leaves = c(6L, 18L, 9L, 24L, 6L, 12L, 6L,
6L, 63L, 66L, 15L, 9L, 15L, 21L, 12L, 12L, 3L, 42L, 12L, 9L,
3L, 15L, 21L, 18L, 27L, 15L, 21L, 36L, 24L, 21L, 3L, 12L, 18L,
6L, 3L, 15L, 3L, 3L), tray = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L), soil_nutrient = c(62.4,
71, 13.6, 37.6, 13.8, 62.4, 37.6, 13.8, 47.6, 51.8, 62.4, 71,
13.6, 37.6, 13.8, 25.4, 14.2, 25.4, 17, 14.2, 25.4, 51.6, 72.6,
14.2, 25.4, 17, 148.2, 29.4, 29.4, 29.4, 148.2, 29.4, 103.6,
122.4, 122.4, 116.2, 141, 117.6), watering = c(81.6, 89, 10.6,
57.8, 1.2, 81.6, 57.8, 1.2, 46, 48.6, 81.6, 89, 10.6, 57.8, 1.2,
57.8, 10.6, 57.8, 1.2, 10.6, 57.8, 81.6, 89, 10.6, 57.8, 1.2,
89, 10.6, 10.6, 10.6, 89, 10.6, 57.8, 1.2, 1.2, 46, 48.6, 81.6
), UV_treatment = c("n", "n", "n", "n", "n", "n", "n", "n", "n",
"n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n",
"n", "n", "n", "n", "n", "y", "y", "y", "n", "y", "n", "n", "n",
"n", "n", "n")), class = "data.frame", row.names = c(NA, -38L
))
df
plant month time leaves tray soil_nutrient watering UV_treatment
1 1 4 0 6 1 62.4 81.6 n
2 1 5 1 18 1 71.0 89.0 n
3 1 6 2 9 1 13.6 10.6 n
4 1 7 3 24 1 37.6 57.8 n
5 1 8 4 6 1 13.8 1.2 n
6 2 4 0 12 1 62.4 81.6 n
7 3 7 0 6 1 37.6 57.8 n
8 3 8 1 6 1 13.8 1.2 n
9 4 2 0 63 1 47.6 46.0 n
10 4 3 1 66 1 51.8 48.6 n
11 4 4 2 15 1 62.4 81.6 n
12 4 5 3 9 1 71.0 89.0 n
13 4 6 4 15 1 13.6 10.6 n
14 4 7 5 21 1 37.6 57.8 n
15 4 8 6 12 1 13.8 1.2 n
16 28 7 0 12 3 25.4 57.8 n
17 29 6 0 3 3 14.2 10.6 n
18 29 7 1 42 3 25.4 57.8 n
19 29 8 2 12 3 17.0 1.2 n
20 30 6 0 9 3 14.2 10.6 n
21 31 7 0 3 3 25.4 57.8 n
22 32 4 0 15 3 51.6 81.6 n
23 32 5 1 21 3 72.6 89.0 n
24 32 6 2 18 3 14.2 10.6 n
25 32 7 3 27 3 25.4 57.8 n
26 32 8 4 15 3 17.0 1.2 n
27 62 5 0 21 7 148.2 89.0 n
28 62 6 1 36 7 29.4 10.6 y
29 63 6 0 24 7 29.4 10.6 y
30 64 6 0 21 7 29.4 10.6 y
31 65 5 0 3 7 148.2 89.0 n
32 65 6 1 12 7 29.4 10.6 y
33 65 7 2 18 7 103.6 57.8 n
34 65 8 3 6 7 122.4 1.2 n
35 66 8 0 3 7 122.4 1.2 n
36 67 2 0 15 7 116.2 46.0 n
37 67 3 1 3 7 141.0 48.6 n
38 67 4 2 3 7 117.6 81.6 n
plant = the identification number of each individual plant.
month = the month in which the plant germinated.
time = similar to month, but each plant starting from zero.
leaves = number of leaves of each individual plant.
tray = the tray in which each plant was; there were 10 trays in all, each tray containing a different type of soil.
soil_nutrient = the total amount of nutrients of each tray in each month.
watering = the amount of water added to each tray in each month.
UV_treatment = an aggressive UV treatment whose effect on the plants we are interested in exploring.
I'm interested in checking if there is some pattern in their growth, I mean if they first show an increase in the number of leaves, if they then get stable, and if they suddenly die or gradually decrease the number of leaves before dying.
I've been looking for phenology analysis in R, but what I've found so far is mainly related to climate change, global parameters, etc. On the other hand, growth analysis considering time series is focused on a long sequence of data of a particular variable, not several individuals.
library(tidyverse)
library(ggpubr)
library(lmerTest)
#> Loading required package: lme4
#> Loading required package: Matrix
#>
#> Attaching package: 'Matrix'
#> The following objects are masked from 'package:tidyr':
#>
#> expand, pack, unpack
#>
#> Attaching package: 'lmerTest'
#> The following object is masked from 'package:lme4':
#>
#> lmer
#> The following object is masked from 'package:stats':
#>
#> step
# Example wide-format data: one row per plant, one column per month.
# Cells hold the leaf count as text; "-" marks a month without a count.
# (The second month's header previously contained a stray character,
# "Moßnth.2"; it is now spelled like the other month columns.)
data <- tibble::tribble(
  ~Plant, ~Month.1, ~Month.2, ~Month.3, ~Month.4, ~Month.5, ~Month.6, ~Month.7,
  "plant.1", "3", "21", "15", "-", "-", "-", "-",
  "plant.2", "-", "7", "14", "-", "-", "-", "-",
  "plant.3", "-", "8", "12", "10", "-", "-", "-",
  "plant.4", "-", "-", "1", "3", "5", "-", "-",
  "plant.5", "-", "3", "6", "18", "13", "4", "-"
)
# Reshape to long form (one row per plant and month) and convert the text
# columns to numbers.
data <-
data %>%
pivot_longer(-Plant, names_to = "Month", values_to = "n_leafs") %>%
mutate(
# Non-numeric cells ("-") become NA here, which triggers the coercion warning.
n_leafs = n_leafs %>% as.numeric(),
# Keep only the trailing digits of "Month.<k>" / "plant.<k>".
Month = Month %>% str_extract("[0-9]+$") %>% as.numeric(),
Plant = Plant %>% str_extract("[0-9]+$") %>% as.numeric()
) %>%
# Normalization: Just count the number of months since seeding for each Plant
# NOTE(review): rows with NA n_leafs are still present at this point, so
# min(Month) is the first month column for every plant rather than the plant's
# first recorded month -- confirm this is the intended alignment.
group_by(Plant) %>%
mutate(Month = Month - min(Month)) %>%
ungroup()
#> Warning in n_leafs %>% as.numeric(): NAs introduced by coercion
#
# The overall trend is that the number of leaves are getting lesser over time.
# However, this is not significant.
#
data %>%
ggplot(aes(Month, n_leafs, color = Plant)) +
geom_point() +
stat_smooth(method = "lm") +
stat_cor()
#> `geom_smooth()` using formula 'y ~ x'
#> Warning: Removed 19 rows containing non-finite values (stat_smooth).
#> Warning: Removed 19 rows containing non-finite values (stat_cor).
#> Warning: Removed 19 rows containing missing values (geom_point).
#
# The number of leaves are not significantly different between the months
#
data %>%
ggplot(aes(factor(Month), n_leafs)) +
geom_boxplot() +
stat_compare_means()
#> Warning: Removed 19 rows containing non-finite values (stat_boxplot).
#> Warning: Removed 19 rows containing non-finite values (stat_compare_means).
#
# There is still no significance after controlling for the different plants
#
lmer(n_leafs ~ Month + (1|Plant), data = data) %>%
summary()
#> Linear mixed model fit by REML. t-tests use Satterthwaite's method [
#> lmerModLmerTest]
#> Formula: n_leafs ~ Month + (1 | Plant)
#> Data: data
#>
#> REML criterion at convergence: 96.8
#>
#> Scaled residuals:
#> Min 1Q Median 3Q Max
#> -1.1084 -0.7883 -0.2424 0.6603 1.8827
#>
#> Random effects:
#> Groups Name Variance Std.Dev.
#> Plant (Intercept) 4.789 2.188
#> Residual 34.943 5.911
#> Number of obs: 16, groups: Plant, 5
#>
#> Fixed effects:
#> Estimate Std. Error df t value Pr(>|t|)
#> (Intercept) 8.2657 3.2163 9.1434 2.570 0.0298 *
#> Month 0.3186 1.2149 13.4746 0.262 0.7971
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Correlation of Fixed Effects:
#> (Intr)
#> Month -0.831
Created on 2021-09-26 by the reprex package (v2.0.1)
My data:
library(tidyverse)
library(lme4)
myData <- structure(list(Subjects = c(4L, 3L, 5L, 1L, 9L, 6L, 10L, 2L,
8L, 7L), Gene1 = c(0.318630087617032, -0.58179068471591, 0.714532710891568,
-0.825259425862769, -0.359862131395465, 0.0898861437775305, 0.0962744602851301,
-0.201633952183354, 0.739840499878431, 0.123379501088869), Variant1 = c(1L,
0L, 1L, 2L, 2L, 1L, 0L, 1L, 2L, 0L), Variant2 = c(0L, 0L, 2L,
2L, 0L, 2L, 2L, 2L, 2L, 0L), Variant3 = c(1L, 1L, 0L, 2L, 0L,
1L, 1L, 1L, 2L, 1L), Variant4 = c(1L, 2L, 1L, 0L, 0L, 1L, 0L,
2L, 1L, 1L), Age = c(81L, 60L, 85L, 87L, 76L, 78L, 88L, 64L,
90L, 75L), Sex = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 1L), RIN = c(6L,
6L, 8L, 6L, 8L, 7L, 8L, 7L, 7L, 6L), ABG = structure(c(4L, 5L,
8L, 3L, 6L, 2L, 3L, 4L, 7L, 1L), .Label = c("F1", "F10", "F2",
"F3", "F4", "F5", "F6", "F8"), class = "factor")), row.names = c(NA,
-10L), class = "data.frame", .Names = c("Subjects", "Gene1",
"Variant1", "Variant2", "Variant3", "Variant4", "Age", "Sex",
"RIN", "ABG"))
myData
Subjects Gene1 Variant1 Variant2 Variant3 Variant4 Age Sex RIN ABG
1 4 0.31863009 1 0 1 1 81 0 6 F3
2 3 -0.58179068 0 0 1 2 60 1 6 F4
3 5 0.71453271 1 2 0 1 85 0 8 F8
4 1 -0.82525943 2 2 2 0 87 1 6 F2
5 9 -0.35986213 2 0 0 0 76 0 8 F5
6 6 0.08988614 1 2 1 1 78 1 7 F10
7 10 0.09627446 0 2 1 0 88 0 8 F2
8 2 -0.20163395 1 2 1 2 64 0 7 F3
9 8 0.73984050 2 2 2 1 90 1 7 F6
10 7 0.12337950 0 0 1 1 75 1 6 F1
Gene1 is my dependent variable and Variant1, Variant2, Variant3 and Variant4 are my independent variables. Age, Sex, RIN and ABG are my covariates. Using the tidyverse framework (broom/dplyr/purrr/map) I'd like to iterate through Variant1:Variant4 performing the following linear regressions using a linear mixed model:
lmer(Gene1~Variant1+Age+Sex+RIN+(1|ABG), myData) for Variant1,
lmer(Gene1~Variant2+Age+Sex+RIN+(1|ABG), myData) for Variant2, so on ...
At the end, I'd like to generate a results table with beta coefficients (Estimate), Std.Err and pValues for all Variant* (possibly using tidy/augment/glance??).
PS. The number of Variant* may vary.
Thank you!
For your problem you can gather() in order to utilize group_split() on all the different kinds of variants. From that point we can iterate over each split data.frame and run the linear model. Inside of the map() we'll broom::tidy() the data and add a column to distinguish the estimates for each model. I used map_df() to end up with a single dataframe but you can also just use map() to end up with a list of data.frames.
library(tidyverse)
library(lme4)
library(broom)
dat <- tribble(~Subjects, ~Gene1, ~Variant1, ~Variant2, ~Variant3, ~Variant4, ~Age, ~Sex, ~RIN, ~ABG,
1, -0.82525943, 2, 2, 2, 0, 87, 1, 6, "F2",
2, -0.20163395, 1, 2, 1, 2, 64 , 0 , 7, "F3",
3, -0.58179068, 0, 0, 1, 2, 60 , 1 , 6, "F4",
4, 0.31863009, 1, 0, 1, 1, 81 , 0 , 6, "F3",
5, 0.71453271, 1, 2, 0, 1, 85 , 0 , 8, "F8",
6, 0.08988614, 1, 2, 1, 1, 78 , 1 , 7, "F10",
7, 0.12337950, 0, 0, 1, 1, 75 , 1 , 6, "F1",
8, 0.73984050, 2, 2, 2, 1, 90 , 1 , 7, "F6",
9, -0.35986213, 2, 0, 0, 0, 76 , 0 , 8, "F5",
10, 0.09627446, 0, 2, 1, 0, 88, 0, 8, "F2")
# Fit one mixed model per variant column and stack the tidied coefficient tables.
dat %>%
# Long form: the variant column name goes to `variants`, its value to `var_value`.
# NOTE(review): gather() is superseded in tidyr; pivot_longer() is the current
# equivalent -- kept here to match the accompanying explanation.
gather(key = "variants", value = "var_value", Variant1:Variant4) %>%
# One data frame per distinct value of `variants`.
group_split(variants) %>%
# Fit the model on each piece, tidy it, and label its rows with the variant name.
# NOTE(review): recent broom releases leave lme4 tidiers to broom.mixed -- confirm
# tidy() still dispatches on this model class in your installation.
map_df(~lmer(Gene1~var_value+Age+Sex+RIN+(1|ABG), data = .x) %>%
tidy() %>%
mutate(variant_group = unique(.x$variants)))
#> # A tibble: 28 x 6
#> term estimate std.error statistic group variant_group
#> <chr> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 (Intercept) -3.91 1.96 -2.00 fixed Variant1
#> 2 var_value -0.280 0.120 -2.34 fixed Variant1
#> 3 Age 0.0401 0.00905 4.43 fixed Variant1
#> 4 Sex 0.00107 0.391 0.00274 fixed Variant1
#> 5 RIN 0.161 0.154 1.05 fixed Variant1
#> 6 sd_(Intercept).ABG 0.462 NA NA ABG Variant1
#> 7 sd_Observation.Resid… 0.00234 NA NA Residu… Variant1
#> 8 (Intercept) -3.60 2.74 -1.31 fixed Variant2
#> 9 var_value -0.0625 0.202 -0.309 fixed Variant2
#> 10 Age 0.0295 0.0175 1.69 fixed Variant2
#> # … with 18 more rows
Created on 2019-02-23 by the reprex package (v0.2.1)
I'll start by saying I'm trying to learn r but it doesn't come easy to me. Similar to this post here I am trying to match values in multiple columns from one data frame (df) and then replace those values based on the corresponding columns from the other data frame (df.key). Here is my example df:
name type place ttotal t01 t02 t03 t04 t05 t06 t07 t08 t09
joe cat SE 7 3 2 2 3 2 5 2 0 1
john cat SE 2 0 0 4 0 3 1 3 1 7
sue cat SE 1 2 0 5 0 4 1 4 3 0
jack cat SE 6 3 4 2 2 4 0 2 1 5
Below is my df.key to be used to match the values above in columns df$ttotal to t09 with df.key$class and replace with the values in df.key$mid accordingly:
lo hi class mid
0 0 0 0.0
0 1 1 0.5
1 2 2 3.0
5 10 3 7.5
10 20 4 15.0
20 30 5 25.0
30 40 6 35.0
40 50 7 45.0
so the first row should be:
name type place ttotal t01 t02 t03 t04 t05 t06 t07 t08 t09
joe cat SE 45.0 7.5 3.0 3.0 7.5 3.0 25.0 3.0 0.0 0.5
Here is just one match loop I tried, but it populates the same value across the row:
# Attempt to overwrite columns 4 to 13 of `df`, cell by cell, with values from
# df.key$mid.
for(i in 1:dim(df)[1]){
# NOTE(review): `df$4:13` does not parse in R; the column positions 4:13 were
# presumably intended.
for(j in df$4:13) {
# match() is given the row index `i`, not the cell value df[i, j], so every
# column of a row receives the same replacement.
df[i,j] <- df.key$mid[match(i, df.key$class)]
}
}
Thanks for the help. I'd like to try to get a solution somewhat similar to this in hopes I can understand it.
Could do:
library(tidyverse)
# Replace each class code in ttotal:t09 with the matching df.key$mid value.
df %>%
# Long form: column name in `key`, class code in `val`.
gather(key, val, ttotal:t09) %>%
# Join on val == class using columns 3:4 of df.key (presumably class and mid),
# which adds `mid` to every row.
left_join(df.key %>% select(3:4), by = c("val" = "class")) %>%
# Back to wide form with `mid` as the cell value. `val` is still a column here,
# so the following steps collapse the result to one row per name.
spread(key, mid) %>%
group_by(name) %>%
# Take the first non-NA entry of every column within each name.
summarise_all(funs(first(na.omit(.)))) %>%
# Drop the leftover class-code column.
select(-val)
Output:
# A tibble: 4 x 13
name type place t01 t02 t03 t04 t05 t06 t07 t08 t09 ttotal
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 jack cat SE 7.5 15 3 3 15 0 3 0.5 25 35
2 joe cat SE 7.5 3 3 7.5 3 25 3 0 0.5 45
3 john cat SE 0 0 15 0 7.5 0.5 7.5 0.5 45 3
4 sue cat SE 3 0 25 0 15 0.5 15 7.5 0 0.5
You can simply map your keys into your data:
library(tidyverse)
# For every column from ttotal to t09, replace each value with the keys$mid
# entry whose keys$class equals it. map_dbl() visits the values one at a time
# and expects a single number back, so each value must match exactly one class.
mutate_at(dat, vars(ttotal:t09), funs(map_dbl(., ~ keys$mid[keys$class == .x])))
Which outputs:
name type place ttotal t01 t02 t03 t04 t05 t06 t07 t08 t09
1 joe cat SE 45.0 7.5 3 3 7.5 3.0 25.0 3.0 0.0 0.5
2 john cat SE 3.0 0.0 0 15 0.0 7.5 0.5 7.5 0.5 45.0
3 sue cat SE 0.5 3.0 0 25 0.0 15.0 0.5 15.0 7.5 0.0
4 jack cat SE 35.0 7.5 15 3 3.0 15.0 0.0 3.0 0.5 25.0
Explanation:
With dplyr::mutate_at() you can change the values of the variables you select with vars(ttotal:t09), applying the function funs(...) to each of the selected variables. For each variable, map_dbl(., ~ keys$mid[keys$class == .x]) compares it with keys$class element-wise (keys$class == .x), and subsets keys$mid by the resulting Boolean vector.
Your data:
dat <-
structure(
list(
name = c("joe", "john", "sue", "jack"),
type = c("cat",
"cat", "cat", "cat"),
place = c("SE", "SE", "SE", "SE"),
ttotal = c(7L,
2L, 1L, 6L),
t01 = c(3L, 0L, 2L, 3L),
t02 = c(2L, 0L, 0L, 4L),
t03 = c(2L, 4L, 5L, 2L),
t04 = c(3L, 0L, 0L, 2L),
t05 = c(2L,
3L, 4L, 4L),
t06 = c(5L, 1L, 1L, 0L),
t07 = c(2L, 3L, 4L,
2L),
t08 = c(0L, 1L, 3L, 1L),
t09 = c(1L, 7L, 0L, 5L)
),
class = "data.frame",
row.names = c(NA,-4L)
)
keys <-
structure(
list(
lo = c(0L, 0L, 1L, 5L, 10L, 20L, 30L, 40L),
hi = c(0L,
1L, 2L, 10L, 20L, 30L, 40L, 50L),
class = 0:7,
mid = c(0, 0.5,
3, 7.5, 15, 25, 35, 45)
),
class = "data.frame",
row.names = c(NA,-8L)
)
I have a bunch of data frames that look like this in R:
print(output[2])
Button Intensity Acc Intensity RT Time tdelta SubjectID CoupleID PrePost
1: 0 30 0 0.0 0 83325.87 0.000 1531 153 Post
2: 1 30 1 13.5 0 83362.65 36.782 1531 153 Post
3: 1 30 1 15.0 0 83376.68 14.027 1531 153 Post
4: 1 30 1 6.0 0 83392.27 15.585 1531 153 Post
5: 1 30 1 15.0 0 83398.77 6.507 1531 153 Post
print(output[1])
[[1]]
Button Intensity Acc Intensity RT Time tdelta SubjectID CoupleID PrePost
1: 0 30 0 0.0 0 77987.93 0.000 1531 153 Pre
2: 1 30 1 13.5 0 78084.57 96.639 1531 153 Pre
3: 1 30 1 15.0 0 78098.62 14.054 1531 153 Pre
4: 1 30 1 6.0 0 78114.13 15.508 1531 153 Pre
5: 1 30 1 15.0 0 78120.67 6.537 1531 153 Pre
I want to combine them into one big data frame that has the following logic and format:
SubjectID CoupleID PrePost Miss1RT Miss2RT Miss3RT Hit1RT Hit2RT Hit3RT
1531 153 Post 0.00 NA NA NA 36.78 14.027
1531 153 Pre 0.00 NA NA NA 96.638 14.054
if Button == 0, then it's a Miss, if it ==1, then it's a Hit. So, it should be something like:
for row in output[i].rows:
if Button ==0:
Miss1RT ==tdelta
elif Button ==1;
Miss1RT =='NA'
and then a flipped version where if Button is 1, Hit[i]RT is tdelta or else 'NA'.
There are 26 lines per data frame and each row is either a hit or a miss so there will be 26 Miss and 26 Hit columns and each SubjectID gets two rows - one for Pre and one for Post. So the column headers for the final output will be:
SubjectID CoupleID PrePost Miss1RT Miss2RT ...Miss26RT Hit1RT Hit2RT ... Hit26RT
I'm new to R and struggling with the proper syntax.
Something like this should work:
# Get data in the structure OP has: a list of per-session data frames.
output <- list(pre, post)

# Tag every row with its position within the session ("1RT", "2RT", ...).
# The tag is a factor whose levels are in numeric order, so the reshaped
# columns come out as 1RT, 2RT, ..., 10RT instead of alphabetically
# (1RT, 10RT, 11RT, ...). seq_len() also copes with an empty data frame.
output2 <- lapply(output, function(x) {
  tags <- paste0(seq_len(nrow(x)), "RT")
  cbind(x, num = factor(tags, levels = tags))
})
pre_post <- do.call("rbind", output2)

# Perform actual calculations: a row's tdelta counts as a miss when
# Button == 0 and as a hit when Button == 1; the other column gets NA.
pre_post$miss <- ifelse(pre_post$Button == 0, pre_post$tdelta, NA)
pre_post$hit <- ifelse(pre_post$Button == 1, pre_post$tdelta, NA)

# Stack hit/miss into long form. reshape2 is called through its namespace
# because this snippet does not attach the package itself.
pre_post_melted <- reshape2::melt(
  pre_post,
  id.vars = c("SubjectID", "CoupleID", "num", "PrePost"),
  measure.vars = c("hit", "miss")
)

# Spread to one row per subject/session with one column per
# (hit|miss, position) pair. reshape2::dcast() has no `sep` argument (names
# are always joined with "_"), so the former `sep = ""` had no effect and
# has been dropped.
pre_post_res <- reshape2::dcast(
  pre_post_melted,
  SubjectID + CoupleID + PrePost ~ variable + num
)
pre_post_res
#SubjectID CoupleID PrePost hit_1RT hit_2RT hit_3RT hit_4RT hit_5RT miss_1RT miss_2RT miss_3RT miss_4RT miss_5RT
#1 1531 153 Post NA 36.782 14.027 15.585 6.507 0 NA NA NA NA
#2 1531 153 Pre NA 96.639 14.054 15.508 6.537 0 NA NA NA NA
We transpose the data to dynamically create all the variables we want. We also stack the data to avoid repeated steps.
Data:
pre <- structure(list(Button = c(0L, 1L, 1L, 1L, 1L), Intensity = c(30L,
30L, 30L, 30L, 30L), Acc = c(0L, 1L, 1L, 1L, 1L), Intensity = c(0,
13.5, 15, 6, 15), RT = c(0L, 0L, 0L, 0L, 0L), Time = c(77987.93,
78084.57, 78098.62, 78114.13, 78120.67), tdelta = c(0, 96.639,
14.054, 15.508, 6.537), SubjectID = c(1531L, 1531L, 1531L, 1531L,
1531L), CoupleID = c(153L, 153L, 153L, 153L, 153L), PrePost = c("Pre",
"Pre", "Pre", "Pre", "Pre")), .Names = c("Button", "Intensity",
"Acc", "Intensity", "RT", "Time", "tdelta", "SubjectID", "CoupleID",
"PrePost"), row.names = c(NA, -5L), class = "data.frame")
post <- structure(list(Button = c(0L, 1L, 1L, 1L, 1L), Intensity = c(30L,
30L, 30L, 30L, 30L), Acc = c(0L, 1L, 1L, 1L, 1L), Intensity = c(0,
13.5, 15, 6, 15), RT = c(0L, 0L, 0L, 0L, 0L), Time = c(83325.87,
83362.65, 83376.68, 83392.27, 83398.77), tdelta = c(0, 36.782,
14.027, 15.585, 6.507), SubjectID = c(1531L, 1531L, 1531L, 1531L,
1531L), CoupleID = c(153L, 153L, 153L, 153L, 153L), PrePost = c("Post",
"Post", "Post", "Post", "Post")), .Names = c("Button", "Intensity",
"Acc", "Intensity", "RT", "Time", "tdelta", "SubjectID", "CoupleID",
"PrePost"), row.names = c(NA, -5L), class = "data.frame")