New column with conditions in tidyverse - r

I want to add a column days to a dataset with some conditions. For each soil there should be nine rows, one for each value of days (0, 4, 10, 17, 24, 66, 81, 94 and 116). The first two rows (days 0 and 4) should take the value from the SS Period, the rows for days 10-66 should take the value from the N Period, and the last three rows (days 81-116) should take the value from the ES Period.
This is a very bad explanation I know, but I think perhaps it makes sense by looking at the expected_df dataset.
All help is very much appreciated!
df <- structure(list(soil = c(12L, 5L, 3L, 12L, 5L, 3L, 12L, 3L, 5L
), ITS_1 = c(290900, 16090, 12460, 0, 19700, 25000, 114.2, 39100,
25090), Period = c("ES", "ES", "ES", "N", "N", "N", "SS", "SS",
"SS")), row.names = c(NA, -9L), class = "data.frame")
This is how the data should look like
# Expected result: nine rows per soil (3, 5 and 12), one row per sampling day.
# Days 0-4 carry the soil's SS value, days 10-66 its N value and days 81-116
# its ES value.
expected_df <- structure(list(soil = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L), ITS_1 = c(39100, 39100, 25000, 25000, 25000, 25000,
12460, 12460, 12460, 25090, 25090, 19700, 19700, 19700, 19700,
16090, 16090, 16090, 114.2, 114.2, 0, 0, 0, 0, 290900, 290900,
290900), Period = c("SS", "SS", "N", "N", "N", "N", "ES", "ES",
"ES", "SS", "SS", "N", "N", "N", "N", "ES", "ES", "ES", "SS",
"SS", "N", "N", "N", "N", "ES", "ES", "ES"), days = c(0L, 4L,
10L, 17L, 24L, 66L, 81L, 94L, 116L, 0L, 4L, 10L, 17L, 24L, 66L,
81L, 94L, 116L, 0L, 4L, 10L, 17L, 24L, 66L, 81L, 94L, 116L)), class = "data.frame", row.names = c(NA,
-27L))

One solution is to create a dataframe and left_join().
library(dplyr)
# Lookup table pairing every sampling day with the Period whose value it
# should take: 2 days for SS, 4 days for N and 3 days for ES.
df_join <- data.frame(days = c(0, 4, 10, 17, 24, 66, 81, 94, 116),
Period = rep(c("SS", "N", "ES"), times = c(2, 4, 3)))
# Joining on Period repeats each row of df once per day listed for its Period.
df %>%
left_join(df_join, by = "Period")
# soil ITS_1 Period days
# 1 12 290900.0 ES 81
# 2 12 290900.0 ES 94
# 3 12 290900.0 ES 116
# 4 5 16090.0 ES 81
# 5 5 16090.0 ES 94
# 6 5 16090.0 ES 116
# 7 3 12460.0 ES 81
# 8 3 12460.0 ES 94
# 9 3 12460.0 ES 116
# 10 12 0.0 N 10
# 11 12 0.0 N 17
# 12 12 0.0 N 24
# 13 12 0.0 N 66
# 14 5 19700.0 N 10
# 15 5 19700.0 N 17
# 16 5 19700.0 N 24
# 17 5 19700.0 N 66
# 18 3 25000.0 N 10
# 19 3 25000.0 N 17
# 20 3 25000.0 N 24
# 21 3 25000.0 N 66
# 22 12 114.2 SS 0
# 23 12 114.2 SS 4
# 24 3 39100.0 SS 0
# 25 3 39100.0 SS 4
# 26 5 25090.0 SS 0
# 27 5 25090.0 SS 4

Related

Analysing the pattern in the growth of organisms using discrete data [closed]

Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 1 year ago.
This post was edited and submitted for review 1 year ago and failed to reopen the post:
Needs more focus Update the question so it focuses on one problem only by editing this post.
Improve this question
I have data of the size of 200 individuals of a particular plant species. But the size was measured in an indirect way, counting the number of leaves (discrete data), monthly during a total of 14 months. The germination, growth and death of the plants are very irregular, with some plants having a long life span, other dying quickly, and also the germination in time is irregular: new plants kept germinating during all the study period and being incorporated into the study. Here is an example of my data (numbers inside cells refer to the number of leaves):
Month.1
Month.2
Month.3
Month.4
Month.5
Month.6
Month.7
plant.1
3
21
15
-
-
-
-
plant.2
-
7
14
-
-
-
-
plant.3
-
8
12
10
-
-
-
plant.4
-
-
1
3
5
-
-
plant.5
-
3
6
18
13
4
-
.....
...
...
...
...
...
...
...
Following Shibaprasadb's suggestion, here is a subset of my data, already converted to long format:
df <- dput(df)
structure(list(plant = c(1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 28L, 29L, 29L, 29L, 30L, 31L, 32L, 32L,
32L, 32L, 32L, 62L, 62L, 63L, 64L, 65L, 65L, 65L, 65L, 66L, 67L,
67L, 67L), month = c(4L, 5L, 6L, 7L, 8L, 4L, 7L, 8L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 7L, 6L, 7L, 8L, 6L, 7L, 4L, 5L, 6L, 7L, 8L,
5L, 6L, 6L, 6L, 5L, 6L, 7L, 8L, 8L, 2L, 3L, 4L), time = c(0L,
1L, 2L, 3L, 4L, 0L, 0L, 1L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 0L, 0L,
1L, 2L, 0L, 0L, 0L, 1L, 2L, 3L, 4L, 0L, 1L, 0L, 0L, 0L, 1L, 2L,
3L, 0L, 0L, 1L, 2L), leaves = c(6L, 18L, 9L, 24L, 6L, 12L, 6L,
6L, 63L, 66L, 15L, 9L, 15L, 21L, 12L, 12L, 3L, 42L, 12L, 9L,
3L, 15L, 21L, 18L, 27L, 15L, 21L, 36L, 24L, 21L, 3L, 12L, 18L,
6L, 3L, 15L, 3L, 3L), tray = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L), soil_nutrient = c(62.4,
71, 13.6, 37.6, 13.8, 62.4, 37.6, 13.8, 47.6, 51.8, 62.4, 71,
13.6, 37.6, 13.8, 25.4, 14.2, 25.4, 17, 14.2, 25.4, 51.6, 72.6,
14.2, 25.4, 17, 148.2, 29.4, 29.4, 29.4, 148.2, 29.4, 103.6,
122.4, 122.4, 116.2, 141, 117.6), watering = c(81.6, 89, 10.6,
57.8, 1.2, 81.6, 57.8, 1.2, 46, 48.6, 81.6, 89, 10.6, 57.8, 1.2,
57.8, 10.6, 57.8, 1.2, 10.6, 57.8, 81.6, 89, 10.6, 57.8, 1.2,
89, 10.6, 10.6, 10.6, 89, 10.6, 57.8, 1.2, 1.2, 46, 48.6, 81.6
), UV_treatment = c("n", "n", "n", "n", "n", "n", "n", "n", "n",
"n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n",
"n", "n", "n", "n", "n", "y", "y", "y", "n", "y", "n", "n", "n",
"n", "n", "n")), class = "data.frame", row.names = c(NA, -38L
))
df
plant month time leaves tray soil_nutrient watering UV_treatment
1 1 4 0 6 1 62.4 81.6 n
2 1 5 1 18 1 71.0 89.0 n
3 1 6 2 9 1 13.6 10.6 n
4 1 7 3 24 1 37.6 57.8 n
5 1 8 4 6 1 13.8 1.2 n
6 2 4 0 12 1 62.4 81.6 n
7 3 7 0 6 1 37.6 57.8 n
8 3 8 1 6 1 13.8 1.2 n
9 4 2 0 63 1 47.6 46.0 n
10 4 3 1 66 1 51.8 48.6 n
11 4 4 2 15 1 62.4 81.6 n
12 4 5 3 9 1 71.0 89.0 n
13 4 6 4 15 1 13.6 10.6 n
14 4 7 5 21 1 37.6 57.8 n
15 4 8 6 12 1 13.8 1.2 n
16 28 7 0 12 3 25.4 57.8 n
17 29 6 0 3 3 14.2 10.6 n
18 29 7 1 42 3 25.4 57.8 n
19 29 8 2 12 3 17.0 1.2 n
20 30 6 0 9 3 14.2 10.6 n
21 31 7 0 3 3 25.4 57.8 n
22 32 4 0 15 3 51.6 81.6 n
23 32 5 1 21 3 72.6 89.0 n
24 32 6 2 18 3 14.2 10.6 n
25 32 7 3 27 3 25.4 57.8 n
26 32 8 4 15 3 17.0 1.2 n
27 62 5 0 21 7 148.2 89.0 n
28 62 6 1 36 7 29.4 10.6 y
29 63 6 0 24 7 29.4 10.6 y
30 64 6 0 21 7 29.4 10.6 y
31 65 5 0 3 7 148.2 89.0 n
32 65 6 1 12 7 29.4 10.6 y
33 65 7 2 18 7 103.6 57.8 n
34 65 8 3 6 7 122.4 1.2 n
35 66 8 0 3 7 122.4 1.2 n
36 67 2 0 15 7 116.2 46.0 n
37 67 3 1 3 7 141.0 48.6 n
38 67 4 2 3 7 117.6 81.6 n
plant = the identification number of each individual plant.
month = the month in which the plant germinated.
time = similar to month, but each plant starting from zero.
leaves = number of leaves of each individual plant.
tray = the tray in which each plant was; there were 10 trays in all, each tray containing a different type of soil.
soil_nutrient = the total amount of nutrients of each tray in each month.
watering = the amount of water added to each tray in each month.
UV_treatment = an aggressive UV treatment whose effect on the plants we are interested in exploring.
I'm interested in checking if there is some pattern in their growth, I mean if they first show an increase in the number of leaves, if they then get stable, and if they suddenly die or gradually decrease the number of leaves before dying.
I've been looking for phenology analysis in R, but what I've found so far is mainly related to climate change, global parameters, etc. On the other hand, growth analysis considering time series is focused on a long sequence of data of a particular variable, not several individuals.
library(tidyverse)
library(ggpubr)
library(lmerTest)
#> Loading required package: lme4
#> Loading required package: Matrix
#>
#> Attaching package: 'Matrix'
#> The following objects are masked from 'package:tidyr':
#>
#> expand, pack, unpack
#>
#> Attaching package: 'lmerTest'
#> The following object is masked from 'package:lme4':
#>
#> lmer
#> The following object is masked from 'package:stats':
#>
#> step
data <- tibble::tribble(
~Plant, ~Month.1, ~Month.2, ~Month.3, ~Month.4, ~Month.5, ~Month.6, ~Month.7,
"plant.1", "3", "21", "15", "-", "-", "-", "-",
"plant.2", "-", "7", "14", "-", "-", "-", "-",
"plant.3", "-", "8", "12", "10", "-", "-", "-",
"plant.4", "-", "-", "1", "3", "5", "-", "-",
"plant.5", "-", "3", "6", "18", "13", "4", "-"
)
data <-
data %>%
pivot_longer(-Plant, names_to = "Month", values_to = "n_leafs") %>%
mutate(
n_leafs = n_leafs %>% as.numeric(),
Month = Month %>% str_extract("[0-9]+$") %>% as.numeric(),
Plant = Plant %>% str_extract("[0-9]+$") %>% as.numeric()
) %>%
# Normalization: Just count the number of months since seeding for each Plant
group_by(Plant) %>%
mutate(Month = Month - min(Month)) %>%
ungroup()
#> Warning in n_leafs %>% as.numeric(): NAs introduced by coercion
#
# The overall trend is that the number of leaves are getting lesser over time.
# However, this is not significant.
#
data %>%
ggplot(aes(Month, n_leafs, color = Plant)) +
geom_point() +
stat_smooth(method = "lm") +
stat_cor()
#> `geom_smooth()` using formula 'y ~ x'
#> Warning: Removed 19 rows containing non-finite values (stat_smooth).
#> Warning: Removed 19 rows containing non-finite values (stat_cor).
#> Warning: Removed 19 rows containing missing values (geom_point).
#
# The number of leaves are not significantly different between the months
#
data %>%
ggplot(aes(factor(Month), n_leafs)) +
geom_boxplot() +
stat_compare_means()
#> Warning: Removed 19 rows containing non-finite values (stat_boxplot).
#> Warning: Removed 19 rows containing non-finite values (stat_compare_means).
#
# There is still no significance after controlling for the different plants
#
lmer(n_leafs ~ Month + (1|Plant), data = data) %>%
summary()
#> Linear mixed model fit by REML. t-tests use Satterthwaite's method [
#> lmerModLmerTest]
#> Formula: n_leafs ~ Month + (1 | Plant)
#> Data: data
#>
#> REML criterion at convergence: 96.8
#>
#> Scaled residuals:
#> Min 1Q Median 3Q Max
#> -1.1084 -0.7883 -0.2424 0.6603 1.8827
#>
#> Random effects:
#> Groups Name Variance Std.Dev.
#> Plant (Intercept) 4.789 2.188
#> Residual 34.943 5.911
#> Number of obs: 16, groups: Plant, 5
#>
#> Fixed effects:
#> Estimate Std. Error df t value Pr(>|t|)
#> (Intercept) 8.2657 3.2163 9.1434 2.570 0.0298 *
#> Month 0.3186 1.2149 13.4746 0.262 0.7971
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Correlation of Fixed Effects:
#> (Intr)
#> Month -0.831
Created on 2021-09-26 by the reprex package (v2.0.1)

Calculating average pairwise Pearson Correlation Coefficients from Data Frame in R

Suppose I have the following vectors:
IDs_Complex_1 <- c("orangutan", "panda", "sloth", "mountain_gorilla", "dolphin", "snake")
IDs_Complex_2 <- c("bat", "penguin", "goat", "elephant", "tiger")
I would like to calculate the pairwise Pearson Correlation Coefficients between the values in the tissue column taken vertically, for each vector, in the following data frame. I then wish to find the average PCC of all possible combinations.
Complex_ID Tissue_X Tissue_Y Tissue_Z
orangutan 5 6 7
panda 6 7 8
sloth 7 8 9
mountain_gorilla 100 60 50
dolphin 115 62 51
snake 130 59 67
bat 2 6 7
penguin 15 11 12
goat 22 23 86
elephant 14 22 109
tiger 0 1 7
So to illustrate this for complex 1, I wish to calculate:
PCC_1 <- PCC of (5, 6, 7, 100, 115, 130) and (6, 7, 8, 60, 62, 59)
PCC_2 <- PCC of (5, 6, 7, 100, 115, 130) and (7, 8, 9, 50, 51, 67)
PCC_3 <- PCC of (6, 7, 8, 60, 62, 59) and (7, 8, 9, 50, 51, 67)
I Wish to compute the average of
(PCC_1, PCC_2, PCC_3) = ?
But what if I have twenty or so tissue columns, where there would be 20!/(2! × 18!) = 190 combinations (without repetition) of pairwise correlation coefficients? How would I code that?
Many thanks!
Abigail
If df is your data.frame:
df = structure(list(Complex_ID = structure(c(6L, 7L, 9L, 5L, 2L, 10L,
1L, 8L, 4L, 3L, 11L), .Label = c("bat", "dolphin", "elephant",
"goat", "mountain_gorilla", "orangutan", "panda", "penguin",
"sloth", "snake", "tiger"), class = "factor"), Tissue_X = c(5L,
6L, 7L, 100L, 115L, 130L, 2L, 15L, 22L, 14L, 0L), Tissue_Y = c(6L,
7L, 8L, 60L, 62L, 59L, 6L, 11L, 23L, 22L, 1L), Tissue_Z = c(7L,
8L, 9L, 50L, 51L, 67L, 7L, 12L, 86L, 109L, 7L)), class = "data.frame", row.names = c(NA,
-11L))
You can do:
cor(df[,-1])
Tissue_X Tissue_Y Tissue_Z
Tissue_X 1.0000000 0.9748668 0.4119840
Tissue_Y 0.9748668 1.0000000 0.5440719
Tissue_Z 0.4119840 0.5440719 1.0000000

r - Find corresponding value from multiple columns according to pmin in multiple columns

My df is sth like this:
Item P P1 P2 P3 D1 D2 D3 pmin num NP
A 10 8 11 20 2 1 10 1 D2 11
B 10 8 11 20 2 1 10 1 D2 11
C 10 8 11 20 2 1 10 1 D2 11
D 50 40 35 70 10 15 20 10 D1 40
E 20 15 22 30 5 2 10 2 D2 22
As shown in my df above, I've first calculated D1 and D2, 'pmin' is the parallel min for D1 and D2, 'num' gives the column name(D1 or D2) corresponding to my pmin.
Now what I want is return a new column called 'NP' that gives me the corresponding values in P1 or P2 according to the pmin (by looking across the row). For example, if it says D2 in 'num', looking across the row, I return value from P2, if it says D1 in 'num', I return the value from P1.
Not sure if I explained it nicely but here's how I did for 'pmin' and 'num':
df$pmin = do.call(pmin, df[,5:6] )
df$num = apply(df[,5:6], 1,function(x) names(x)[which.min(x)])
Also in my real dataset, I have P1 through P4 and D1 through D4.
I tried sth like
ifelse( num == 'D1', P1, P2)
but it doesn't work for more than two columns (P1~P4..)
Thanks in advance!!
btw does anyone know how to use
case_when()
from library(dplyr) to get 'NP'?
We can use row/column indexing to extract the elements of 'P1/P2' columns that corresponds to the 'D1', 'D2'
# Two-column (row, column) index matrix: the first column is the row number,
# the second is the position of that row's `num` label within D1..D3.
m1 <- cbind(seq_len(nrow(df)), match(df$num, c("D1", "D2", "D3")))
# Indexing the P1..P3 columns with that matrix picks one element per row:
# the P column at the same position as the row's D label (D1 -> P1, D2 -> P2,
# D3 -> P3). For more columns, extend both name vectors in the same order.
df$NP <- df[c("P1", "P2", "P3")][m1]
df$NP
#[1] 11 11 11 40 22
data
df <- structure(list(Item = c("A", "B", "C", "D", "E"), P = c(10L,
10L, 10L, 50L, 20L), P1 = c(8L, 8L, 8L, 40L, 15L), P2 = c(11L,
11L, 11L, 35L, 22L), P3 = c(20L, 20L, 20L, 70L, 30L), D1 = c(2L,
2L, 2L, 10L, 5L), D2 = c(1L, 1L, 1L, 15L, 2L), D3 = c(10L, 10L,
10L, 20L, 10L), pmin = c(1L, 1L, 1L, 10L, 2L), num = c("D2",
"D2", "D2", "D1", "D2"), NP = c(11L, 11L, 11L, 40L, 22L)),
class = "data.frame", row.names = c(NA,
-5L))

Calculating top 4 of column 1 by column 2 - R

I'm new in R and to be honest don't know how to call what I'm looking for :)
I have data-set "ds" set with 2 columns:
D | res
==========
Ds 20
Dx 23
Dp 1
Ds 12
Ds 23
Ds 54
Dn 65
Ds 122
Dx 11
Dx 154
Dx 18
Do 4
Df 17
Dp 5
Dp 107
Dp 8
Df 3
Dp 33
Dd 223
Dc 7
Dv 22
Du 34
Dh 22
Ds 12
Dy 78
Dd 128
I need to calculate top 4 from column "D" by "Res" so desired result would look like :
D | Res
========
Dd 351
Dp 154
Ds 243
Dx 206
and by %age:
D | % Of Total
==========
Dd 29.10%
Dp 12.77%
Ds 20.15%
Dx 17.08%
Thanks
We can use aggregate() to obtain the sum of each type of "D", and we can introduce a new column to account for the edit of the OP and include also the percentage.
In order to display the result in the desired form, we can apply the order() function to rearrange the rows according to the value of Res. The function rev() in this case ensures that the highest value is put on top, and head() with the parameter 4 displays the first four rows.
# Sum Res within each level of D (`Res ~ .` groups by every other column).
summarized <- aggregate(Res ~. , df1, sum)
# Each group's share of the grand total, rounded to two decimals and stored
# as a character string with a trailing "%".
summarized$Perc <- with(summarized, paste0(round(Res/sum(Res)*100,2),"%"))
# Reverse the ascending order of Res so the largest sums come first, then
# show the top four rows.
head(summarized[rev(order(summarized$Res)),],4)
D Res Perc
2 Dd 351 29.1%
8 Ds 243 20.15%
11 Dx 206 17.08%
7 Dp 154 12.77%
data
df1 <- structure(list(D = structure(c(8L, 11L, 7L, 8L, 8L, 8L, 5L,
8L, 11L, 11L, 11L, 6L, 3L, 7L, 7L, 7L, 3L, 7L, 2L, 1L, 10L, 9L,
4L, 8L, 12L, 2L), .Label = c("Dc", "Dd", "Df", "Dh", "Dn", "Do",
"Dp", "Ds", "Du", "Dv", "Dx", "Dy"), class = "factor"), Res = c(20L,
23L, 1L, 12L, 23L, 54L, 65L, 122L, 11L, 154L, 18L, 4L, 17L, 5L,
107L, 8L, 3L, 33L, 223L, 7L, 22L, 34L, 22L, 12L, 78L, 128L)),
.Names = c("D", "Res"), class = "data.frame", row.names = c(NA, -26L))
If you mean to sum Res per D and then select the top 4 sums (assuming you made mistakes calculating the sums for ds and dp) you could try:
library(dplyr)
df1 %>% mutate(per = Res/sum(Res)) %>% group_by(D) %>% summarise(Res = sum(Res), perc = sum(per)) %>% top_n(4, Res)
Source: local data frame [4 x 3]
D Res perc
(fctr) (int) (dbl)
1 Dd 351 0.2910448
2 Dp 154 0.1276949
3 Ds 243 0.2014925
4 Dx 206 0.1708126
Option using data.table
library(data.table)
out = setorder(setDT(data)[, .(tmp = sum(res)), by = D]
[, .(D, ptg = (tmp/sum(tmp))*100)], -ptg)[1:4,]
#> out
# D ptg
#1: Dd 29.10448
#2: Ds 20.14925
#3: Dx 17.08126
#4: Dp 12.76949

R- merge two dataframes but values of ID have semicolons

This is a followup question to:
R- merge two data frames but some values have semi colon in them
which has been addressed by contributor: agstudy.
The actual data discussed in the link is a bit more complex and i have been stuck for a while.
This is what my dataframe (df2) looks like:
myIDColumn someName somevalue
AB gsdfg 123
CD tfgsdfg 234
EF sfdgsf 365
GH gdfgb 53453
IJ sr 64564
KL sfsdv 4234234
MN ewrwe 5
OP dsfsss 3453
QR gggg 667
ST dss 7567
UV hhhhjf 55
WX dfadasad 8657
YZ ghfgh 1234
ABC gdgfg 234455
VCB hgjkk 5555667
This is what my df1 looks like:
ID someText someThing
AB ada 12
CD;EF;QR dfsdf 13
IJ fgfgd 14
KL fgdg 15
MN gh 16
OP;WX jhjhj 17
WW ghjgjhgjghj 18
YZ kkl 19
This is what i was hoping to get as an output:
I can merge the two well by using:
mm <- merge(df2,df1,by.y='ID',by.x='myIDColumn',all.y=TRUE)
but after that no idea how to proceed further.
Any help is really appreciated. Thanks.
df1:
structure(list(ID = structure(1:8, .Label = c("AB", "CD;EF;QR",
"IJ", "KL", "MN", "OP;WX", "WW", "YZ"), class = "factor"), someText = structure(c(1L,
2L, 4L, 3L, 5L, 7L, 6L, 8L), .Label = c("ada", "dfsdf", "fgdg",
"fgfgd", "gh", "ghjgjhgjghj", "jhjhj", "kkl"), class = "factor"),
someThing = 12:19), .Names = c("ID", "someText", "someThing"
), class = "data.frame", row.names = c(NA, -8L))
df2:
structure(list(myIDColumn = structure(c(1L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 14L, 15L, 2L, 13L), .Label = c("AB", "ABC",
"CD", "EF", "GH", "IJ", "KL", "MN", "OP", "QR", "ST", "UV", "VCB",
"WX", "YZ"), class = "factor"), someName = structure(c(9L, 15L,
12L, 5L, 14L, 13L, 4L, 2L, 7L, 3L, 11L, 1L, 8L, 6L, 10L), .Label = c("dfadasad",
"dsfsss", "dss", "ewrwe", "gdfgb", "gdgfg", "gggg", "ghfgh",
"gsdfg", "hgjkk", "hhhhjf", "sfdgsf", "sfsdv", "sr", "tfgsdfg"
), class = "factor"), somevalue = c(123L, 234L, 365L, 53453L,
64564L, 4234234L, 5L, 3453L, 667L, 7567L, 55L, 8657L, 1234L,
234455L, 5555667L)), .Names = c("myIDColumn", "someName", "somevalue"
), class = "data.frame", row.names = c(NA, -15L))
There are probably better ways to do it but you could create a temporary dataframe:
df1 <- structure(list(ID = c("AB", "CD;EF;QR", "IJ", "KL", "MN", "OP;WX",
"WW", "YZ"), someText = c("ada", "dfsdf", "fgfgd", "fgdg", "gh",
"jhjhj", "ghjgjhgjghj", "kkl"), someThing = 12:19), .Names = c("ID",
"someText", "someThing"), class = "data.frame", row.names = c(NA,
-8L))
df2 <- structure(list(myIDColumn = c("AB", "CD", "EF", "GH", "IJ", "KL",
"MN", "OP", "QR", "ST", "UV", "WX", "YZ", "ABC", "VCB"), someName = c("gsdfg",
"tfgsdfg", "sfdgsf", "gdfgb", "sr", "sfsdv", "ewrwe", "dsfsss",
"gggg", "dss", "hhhhjf", "dfadasad", "ghfgh", "gdgfg", "hgjkk"
), somevalue = c(123L, 234L, 365L, 53453L, 64564L, 4234234L,
5L, 3453L, 667L, 7567L, 55L, 8657L, 1234L, 234455L, 5555667L)), .Names = c("myIDColumn",
"someName", "somevalue"), class = "data.frame", row.names = c(NA,
-15L))
# Expand rows whose ID holds several ";"-separated identifiers into one row
# per identifier.
#
# x: a data frame with columns `ID`, `someText` and `someThing`. `ID` may be
#    character or factor and may contain values such as "CD;EF;QR".
# Returns a data frame with the columns `ID`, `someText`, `someThing` (each
# input row repeated once per identifier) plus `ID1`, the single identifier,
# which can be used as a merge key.
f <- function(x) {
  # strsplit() rejects factors, so coerce first; fixed = TRUE treats ";" as a
  # literal separator rather than a regular expression.
  ids <- strsplit(as.character(x$ID), ";", fixed = TRUE)
  # An empty ID splits into zero pieces; keep such a row with an empty ID1
  # instead of silently dropping it.
  ids[lengths(ids) == 0L] <- ""
  # Repeat every row once per identifier so all columns have equal length,
  # even when x has more than one row.
  n_ids <- lengths(ids)
  data.frame(
    ID = rep(x$ID, n_ids),
    someText = rep(x$someText, n_ids),
    someThing = rep(x$someThing, n_ids),
    ID1 = as.character(unlist(ids))
  )
}
library(plyr)
df3 <- ddply(df1, .(ID), f)
> df3
ID someText someThing ID1
1 AB ada 12 AB
2 CD;EF;QR dfsdf 13 CD
3 CD;EF;QR dfsdf 13 EF
4 CD;EF;QR dfsdf 13 QR
5 IJ fgfgd 14 IJ
6 KL fgdg 15 KL
7 MN gh 16 MN
8 OP;WX jhjhj 17 OP
9 OP;WX jhjhj 17 WX
10 WW ghjgjhgjghj 18 WW
11 YZ kkl 19 YZ
You could merge this with your dataframe df2 and summarize the data:
mm <- merge(df2,df3,by.y='ID1',by.x='myIDColumn',all.y=TRUE)
ddply(mm, .(ID,someText, someThing), summarize,
somevalue = paste(somevalue, collapse=','),
someName = paste(someName, collapse = ","))
ID someText someThing somevalue someName
1 AB ada 12 123 gsdfg
2 CD;EF;QR dfsdf 13 234,365,667 tfgsdfg,sfdgsf,gggg
3 IJ fgfgd 14 64564 sr
4 KL fgdg 15 4234234 sfsdv
5 MN gh 16 5 ewrwe
6 OP;WX jhjhj 17 3453,8657 dsfsss,dfadasad
7 WW ghjgjhgjghj 18 NA NA
8 YZ kkl 19 1234 ghfgh

Resources