I have a dataframe with different company IDs appearing from once to over 30 times in different rows. I want to add a new column "di_Flex" and fill it with specific values depending on how often the same company ID appears in a column:
If it appears twice in the column, add the value 6 to the new column "di_Flex",
if it appears 3x, add "8",
if it appears 4x add "10",
if it appears 5x add "12.8",
if it appears 6x add "14.67",
if it appears 7 or more times add "16".
Here is the dataframe:
c(0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 7, 7, 8, 9, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14,
15, 16, 17, 17, 18, 18, 19, 20, 21, 22, 23, 23, 23, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27,
28, 29, 30, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 36,
36, 37, 38, 38, 38, 38, 38, 38, 39, 40, 41, 41, 41, 42, 42, 42,
43, 43, 43, 44, 45, 45, 46, 46, 46, 47, 48, 49, 50, 50, 51, 53,
54, 54, 54, 54, 55, 57, 57, 57, 59, 59, 59, 59, 60, 60, 60, 60,
61, 61, 62, 62, 62, 63, 63, 64, 64, 64, 64, 65, 65, 66, 66, 66,
66, 66, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA)
Thank you for your help!
Assuming your data is called df with a column value:
library(tidyverse)

# Count how often each ID occurs (helper column `n`), translate that count
# into the requested di_Flex value and drop the helper column again.
# add_count() attaches the group size directly, so no self-join on a
# tallied copy of the data is needed.
# An ID that occurs only once matches none of the rules and gets NA.
df %>%
  add_count(value, name = "n") %>%
  mutate(
    di_Flex = case_when(
      n == 2 ~ 6,
      n == 3 ~ 8,
      n == 4 ~ 10,
      n == 5 ~ 12.8,
      n == 6 ~ 14.67,
      n >= 7 ~ 16
    )
  ) %>%
  select(-n)
This gives us:
1 0 12.8
2 0 12.8
3 0 12.8
4 0 12.8
5 0 12.8
6 1 NA
7 2 NA
8 3 NA
9 4 NA
10 5 8.0
11 5 8.0
12 5 8.0
13 6 16.0
14 6 16.0
15 6 16.0
16 6 16.0
17 6 16.0
18 6 16.0
19 6 16.0
20 6 16.0
Data:
df <- data.frame(value = c(0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 7, 7, 8, 9, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14,
15, 16, 17, 17, 18, 18, 19, 20, 21, 22, 23, 23, 23, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27,
28, 29, 30, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 36,
36, 37, 38, 38, 38, 38, 38, 38, 39, 40, 41, 41, 41, 42, 42, 42,
43, 43, 43, 44, 45, 45, 46, 46, 46, 47, 48, 49, 50, 50, 51, 53,
54, 54, 54, 54, 55, 57, 57, 57, 59, 59, 59, 59, 60, 60, 60, 60,
61, 61, 62, 62, 62, 63, 63, 64, 64, 64, 64, 65, 65, 66, 66, 66,
66, 66, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA))
This question already has answers here:
Can dplyr package be used for conditional mutating?
(5 answers)
Closed 2 years ago.
I want to create a column A4 that equals A3, but reduced by 1 in the rows where Total == 63. What am I doing wrong here?
tb1 %>%
mutate(A4 = replace(A3, Total == 63, A3-1))
The complete code with data is here
library(tidyverse)
tb1 <-
structure(
list(
A1 = c(16, 11, 16, 18, 20, 19, 16, 18, 20, 15,
17, 19, 19, 19, 16, 19, 16, 15, 19, 19, 16, 18, 18, 19, 19, 18,
20, 18, 19, 19, 19, 19, 17, 19, 17, 16, 18, 19, 16, 18, 17, 19,
19, 20, 17, 16, 18, 16, 15, 19, 19, 17, 20, 18, 16, 19, 19, 15,
17, 17, 19, 19, 16, 17, 18, 19, 17, 19, 17, 15, 19, 16, 17
)
, A2 = c(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
)
, A3 = c(33, 34, 38, 36, 36, 34, 41, 36, 40, 38, 38, 41, 38, 34, 33, 36,
41, 40, 41, 38, 41, 33, 40, 38, 40, 38, 41, 41, 40, 41, 40,
38, 34, 40, 36, 41, 40, 40, 33, 38, 36, 41, 40, 40, 28, 41,
40, 41, 33, 41, 36, 36, 40, 34, 41, 41, 38, 38, 41, 38, 41,
41, 36, 40, 38, 38, 40, 41, 38, 22, 36, 34, 38
)
, Total = c(57, 53, 62, 62, 64, 61, 65, 62, 68, 61, 63, 68, 65, 61, 57, 63,
65, 63, 68, 65, 65, 59, 66, 65, 67, 64, 69, 67, 67, 68, 67,
65, 59, 67, 61, 65, 66, 67, 57, 64, 61, 68, 67, 68, 53, 65,
66, 65, 56, 68, 63, 61, 68, 60, 65, 68, 65, 61, 66, 63, 68,
68, 60, 65, 64, 65, 65, 68, 63, 45, 63, 58, 63
)
)
, class = "data.frame"
, row.names = c(NA, -73L)
)
tb1 %>%
filter(Total == 63)
#> A1 A2 A3 Total
#> 1 17 8 38 63
#> 2 19 8 36 63
#> 3 15 8 40 63
#> 4 19 8 36 63
#> 5 17 8 38 63
#> 6 17 8 38 63
#> 7 19 8 36 63
#> 8 17 8 38 63
tb2 <-
tb1 %>%
mutate(A4 = replace(A3, Total == 63, A3-1)) %>%
mutate(Total = A1 + A2 + A3)
#> Warning: Problem with `mutate()` input `A4`.
#> x number of items to replace is not a multiple of replacement length
#> ℹ Input `A4` is `replace(A3, Total == 63, A3 - 1)`.
tb2 %>%
filter(Total == 62)
#> A1 A2 A3 Total
#> 1 16 8 38 62
#> 2 18 8 36 62
#> 3 18 8 36 62
You are better off using ifelse here:
library(dplyr)

# Row-wise choice: A3 - 1 where Total is 63, plain A3 everywhere else.
tb1 %>%
  mutate(A4 = ifelse(Total == 63, A3 - 1, A3))
As for why replace does not work, have a look at the source code of replace:
replace
function (x, list, values)
{
x[list] <- values
x
}
It assigns values to x after subsetting for list.
When you use :
tb1 %>% mutate(A4 = replace(A3, Total == 63, A3-1))
your values has length length(tb1$A3), but list selects only sum(tb1$Total == 63) elements of x. These lengths do not match, hence the warning number of items to replace is not a multiple of replacement length: R only uses the first sum(tb1$Total == 63) elements of values, which are not the ones belonging to the selected rows.
If you want to make replace work you can try :
# Subset the replacement values with the same condition, so that the number
# of replacement values equals the number of elements being replaced.
tb1 %>%
  mutate(A4 = replace(A3, Total == 63, A3[Total == 63] - 1))
but again as I mentioned it is easier to just use ifelse here.
via a program I have received the following pattern count.
Counter({'CCCC': 22115, 'TTTT': 22043, 'AAAA': 22037, 'GGGG': 21930, 'AAAC': 154, 'TTAT': 152, 'CCCA': 152, 'CCTC': 152, 'GGGC': 151, 'TTTG': 150, 'GTGG': 149, 'GCCC': 148, 'CCGC': 145, 'CGGG': 145, 'TGGG': 144, 'AGAA': 144, 'TTGT': 144, 'GAAA': 142, 'CCCG': 142, 'CCCT': 142, 'TCCC': 141, 'CAAA': 139, 'ATTT': 137, 'CGCC': 134, 'GGTG': 133, 'GAGG': 133, 'TTTA': 132, 'CTTT': 131, 'TCTT': 131, 'ACCC': 130, 'AGGG': 130, 'GGAG': 129, 'AACA': 129, 'TAAA': 129, 'TATT': 128, 'TTTC': 128, 'AAGA': 127, 'GGGA': 126, 'ACAA': 126, 'TTCT': 125, 'CTCC': 124, 'GCGG': 124, 'ATAA': 123, 'GGCG': 120, 'CACC': 119, 'AAAT': 118, 'AATA': 117, 'AAAG': 114, 'GTTT': 114, 'TGTT': 112, 'GGGT': 112, 'CCAC': 110, 'CGCG': 45, 'AACC': 43, 'TTAA': 41, 'CTCT': 41, 'GGCC': 41, 'ACTC': 40, 'CTTC': 40, 'GCCG': 39, 'ATTA': 39, 'ACCT': 39, 'TGCG': 39, 'ATAT': 39, 'TCTC': 38, 'ACGG': 38, 'TATA': 37, 'ATCA': 37, 'CGGC': 37, 'CGAG': 36, 'AGAG': 36, 'GACA': 35, 'GTTG': 35, 'TGAG': 35, 'TGGT': 35, 'CCAA': 35, 'TTGG': 34, 'GTGT': 34, 'GCGC': 34, 'CACA': 34, 'GTAA': 34, 'GTAG': 34, 'TCCA': 34, 'TCCT': 34, 'AAGG': 34, 'GAGA': 34, 'GCTT': 34, 'GTGC': 33, 'CTAT': 33, 'TTGC': 33, 'CGGA': 33, 'AGGA': 32, 'GACG': 32, 'AATT': 32, 'CAAC': 32, 'CTGC': 32, 'CTAC': 32, 'ACGA': 32, 'CGAC': 32, 'CCGG': 32, 'TCTG': 32, 'GGAA': 32, 'GGAT': 32, 'TGCT': 32, 'TTAG': 32, 'GCTG': 32, 'GAGT': 31, 'AGGC': 31, 'TTCC': 31, 'ATGA': 31, 'TTCA': 31, 'CCAT': 31, 'AAGT': 31, 'GAGC': 31, 'GTAT': 31, 'CGAA': 31, 'TCAT': 31, 'ATTC': 31, 'TGTG': 30, 'AGTT': 30, 'ATCC': 30, 'AGCA': 30, 'GTCT': 30, 'TGTC': 30, 'TCAC': 30, 'CACT': 30, 'ACTA': 30, 'TAAT': 30, 'CCGT': 30, 'CCTA': 29, 'TCGG': 29, 'GGTA': 29, 'TATG': 29, 'AACG': 29, 'CACG': 29, 'GATT': 29, 'ATCT': 29, 'TGGC': 29, 'AGCC': 29, 'TATC': 29, 'GCTC': 29, 'GGCT': 29, 'TCTA': 29, 'AACT': 28, 'CCTT': 28, 'CTTA': 28, 'TGTA': 28, 'TAGT': 28, 'AGTG': 28, 'CCGA': 27, 'AATG': 27, 'CCTG': 27, 'CTGT': 27, 'AGTC': 27, 'GTCC': 27, 'GGTT': 27, 'ACAC': 26, 'TACC': 26, 'CATC': 26, 'CATA': 26, 'GTGA': 
26, 'TGAA': 26, 'GGTC': 26, 'CTTG': 26, 'GCAC': 26, 'GGCA': 26, 'CGTC': 26, 'CTGG': 26, 'TAAG': 26, 'TCGT': 26, 'TGAT': 25, 'CAGA': 25, 'GAAC': 25, 'ACCA': 25, 'TTAC': 25, 'CATT': 25, 'AGAT': 25, 'CGGT': 25, 'ATTG': 25, 'TTGA': 25, 'GATA': 24, 'GGAC': 24, 'AAGC': 24, 'GTCA': 24, 'CAAT': 24, 'GCAG': 24, 'ACAT': 24, 'TGCC': 24, 'ATAG': 24, 'CGTG': 24, 'CGCA': 24, 'TAGG': 23, 'ACCG': 23, 'TTCG': 23, 'AGCG': 23, 'GTTC': 23, 'ACTT': 23, 'CGTT': 23, 'AGAC': 23, 'GCAT': 22, 'TCCG': 22, 'TAAC': 22, 'ACGC': 22, 'CAGC': 22, 'GACC': 22, 'CATG': 22, 'TCGA': 22, 'TAGA': 22, 'GCAA': 22, 'CTCG': 22, 'TACT': 22, 'AATC': 21, 'CGCT': 21, 'GAAT': 21, 'GCGT': 21, 'AGTA': 21, 'GCCA': 21, 'ATGG': 21, 'TCAA': 21, 'CTCA': 21, 'TGGA': 20, 'GAAG': 20, 'GATC': 20, 'TGCA': 20, 'GCCT': 19, 'GTCG': 19, 'CAAG': 19, 'TCGC': 19, 'CTGA': 19, 'GATG': 19, 'CTAA': 19, 'GCGA': 19, 'ATAC': 18, 'GTTA': 18, 'GCTA': 18, 'AGGT': 18, 'CCAG': 18, 'ACAG': 18, 'CTAG': 17, 'CGTA': 17, 'ACGT': 17, 'TACA': 17, 'AGCT': 16, 'CAGG': 16, 'ATGT': 16, 'ATCG': 16, 'ATGC': 15, 'TGAC': 14, 'TAGC': 14, 'ACTG': 14, 'TCAG': 14, 'CGAT': 14, 'TACG': 13, 'CAGT': 11, 'GTAC': 10, 'GACT': 9})
I now want to convert it into a table in which every pattern (e.g. "AAAA") is a column holding its corresponding count, and likewise for all other combinations. Does anyone have an idea how to program this well?
This is how I read the data into R:
daten <- read.table("/PATTERN.txt", header = FALSE, sep = "\t");
So far I've tried direct reading, but somehow it doesn't really work. It should look like this:
AAAA CCCC
1 22128 22127
Thank you very much!
Assuming Lines (shown reproducibly in the Note at the end) contains the data, replace Counter( with [, ) with ] and ' with " in it, and then read the result using fromJSON:
library(jsonlite)

# Turn the Python repr into JSON text one substitution at a time, then parse.
json_txt <- sub("Counter.", "[", Lines)  # "Counter" + next character -> "["
json_txt <- sub("\\)", "]", json_txt)    # first ")" -> "]"
json_txt <- gsub("'", '"', json_txt)     # every single quote -> double quote
fromJSON(json_txt)
giving:
CCCC TTTT AAAA GGGG AAAC TTAT CCCA CCTC GGGC TTTG GTGG GCCC CCGC CGGG
1 22115 22043 22037 21930 154 152 152 152 151 150 149 148 145 145
TGGG AGAA TTGT GAAA CCCG CCCT TCCC CAAA ATTT CGCC GGTG GAGG TTTA CTTT TCTT
1 144 144 144 142 142 142 141 139 137 134 133 133 132 131 131
ACCC AGGG GGAG AACA TAAA TATT TTTC AAGA GGGA ACAA TTCT CTCC GCGG ATAA GGCG
1 130 130 129 129 129 128 128 127 126 126 125 124 124 123 120
CACC AAAT AATA AAAG GTTT TGTT GGGT CCAC CGCG AACC TTAA CTCT GGCC ACTC CTTC
1 119 118 117 114 114 112 112 110 45 43 41 41 41 40 40
GCCG ATTA ACCT TGCG ATAT TCTC ACGG TATA ATCA CGGC CGAG AGAG GACA GTTG TGAG
1 39 39 39 39 39 38 38 37 37 37 36 36 35 35 35
TGGT CCAA TTGG GTGT GCGC CACA GTAA GTAG TCCA TCCT AAGG GAGA GCTT GTGC CTAT
1 35 35 34 34 34 34 34 34 34 34 34 34 34 33 33
TTGC CGGA AGGA GACG AATT CAAC CTGC CTAC ACGA CGAC CCGG TCTG GGAA GGAT TGCT
1 33 33 32 32 32 32 32 32 32 32 32 32 32 32 32
TTAG GCTG GAGT AGGC TTCC ATGA TTCA CCAT AAGT GAGC GTAT CGAA TCAT ATTC TGTG
1 32 32 31 31 31 31 31 31 31 31 31 31 31 31 30
AGTT ATCC AGCA GTCT TGTC TCAC CACT ACTA TAAT CCGT CCTA TCGG GGTA TATG AACG
1 30 30 30 30 30 30 30 30 30 30 29 29 29 29 29
CACG GATT ATCT TGGC AGCC TATC GCTC GGCT TCTA AACT CCTT CTTA TGTA TAGT AGTG
1 29 29 29 29 29 29 29 29 29 28 28 28 28 28 28
CCGA AATG CCTG CTGT AGTC GTCC GGTT ACAC TACC CATC CATA GTGA TGAA GGTC CTTG
1 27 27 27 27 27 27 27 26 26 26 26 26 26 26 26
GCAC GGCA CGTC CTGG TAAG TCGT TGAT CAGA GAAC ACCA TTAC CATT AGAT CGGT ATTG
1 26 26 26 26 26 26 25 25 25 25 25 25 25 25 25
TTGA GATA GGAC AAGC GTCA CAAT GCAG ACAT TGCC ATAG CGTG CGCA TAGG ACCG TTCG
1 25 24 24 24 24 24 24 24 24 24 24 24 23 23 23
AGCG GTTC ACTT CGTT AGAC GCAT TCCG TAAC ACGC CAGC GACC CATG TCGA TAGA GCAA
1 23 23 23 23 23 22 22 22 22 22 22 22 22 22 22
CTCG TACT AATC CGCT GAAT GCGT AGTA GCCA ATGG TCAA CTCA TGGA GAAG GATC TGCA
1 22 22 21 21 21 21 21 21 21 21 21 20 20 20 20
GCCT GTCG CAAG TCGC CTGA GATG CTAA GCGA ATAC GTTA GCTA AGGT CCAG ACAG CTAG
1 19 19 19 19 19 19 19 19 18 18 18 18 18 18 17
CGTA ACGT TACA AGCT CAGG ATGT ATCG ATGC TGAC TAGC ACTG TCAG CGAT TACG CAGT
1 17 17 17 16 16 16 16 15 14 14 14 14 14 13 11
GTAC GACT
1 10 9
Note
Lines <- "
Counter({'CCCC': 22115, 'TTTT': 22043, 'AAAA': 22037, 'GGGG':21930, 'AAAC': 154, 'TTAT': 152, 'CCCA': 152, 'CCTC': 152, 'GGGC': 151, 'TTTG': 150, 'GTGG': 149, 'GCCC': 148, 'CCGC': 145, 'CGGG': 145, 'TGGG': 144, 'AGAA': 144, 'TTGT': 144, 'GAAA': 142, 'CCCG': 142, 'CCCT': 142, 'TCCC': 141, 'CAAA': 139, 'ATTT': 137, 'CGCC': 134, 'GGTG': 133, 'GAGG': 133, 'TTTA': 132, 'CTTT': 131, 'TCTT': 131, 'ACCC': 130, 'AGGG': 130, 'GGAG': 129, 'AACA': 129, 'TAAA': 129, 'TATT': 128, 'TTTC': 128, 'AAGA': 127, 'GGGA': 126, 'ACAA': 126, 'TTCT': 125, 'CTCC': 124, 'GCGG': 124, 'ATAA': 123, 'GGCG': 120, 'CACC': 119, 'AAAT': 118, 'AATA': 117, 'AAAG': 114, 'GTTT': 114, 'TGTT': 112, 'GGGT': 112, 'CCAC': 110, 'CGCG': 45, 'AACC': 43, 'TTAA': 41, 'CTCT': 41, 'GGCC': 41, 'ACTC': 40, 'CTTC': 40, 'GCCG': 39, 'ATTA': 39, 'ACCT': 39, 'TGCG': 39, 'ATAT': 39, 'TCTC': 38, 'ACGG': 38, 'TATA': 37, 'ATCA': 37, 'CGGC': 37, 'CGAG': 36, 'AGAG': 36, 'GACA': 35, 'GTTG': 35, 'TGAG': 35, 'TGGT': 35, 'CCAA': 35, 'TTGG': 34, 'GTGT': 34, 'GCGC': 34, 'CACA': 34, 'GTAA': 34, 'GTAG': 34, 'TCCA': 34, 'TCCT': 34, 'AAGG': 34, 'GAGA': 34, 'GCTT': 34, 'GTGC': 33, 'CTAT': 33, 'TTGC': 33, 'CGGA': 33, 'AGGA': 32, 'GACG': 32, 'AATT': 32, 'CAAC': 32, 'CTGC': 32, 'CTAC': 32, 'ACGA': 32, 'CGAC': 32, 'CCGG': 32, 'TCTG': 32, 'GGAA': 32, 'GGAT': 32, 'TGCT': 32, 'TTAG': 32, 'GCTG': 32, 'GAGT': 31, 'AGGC': 31, 'TTCC': 31, 'ATGA': 31, 'TTCA': 31, 'CCAT': 31, 'AAGT': 31, 'GAGC': 31, 'GTAT': 31, 'CGAA': 31, 'TCAT': 31, 'ATTC': 31, 'TGTG': 30, 'AGTT': 30, 'ATCC': 30, 'AGCA': 30, 'GTCT': 30, 'TGTC': 30, 'TCAC': 30, 'CACT': 30, 'ACTA': 30, 'TAAT': 30, 'CCGT': 30, 'CCTA': 29, 'TCGG': 29, 'GGTA': 29, 'TATG': 29, 'AACG': 29, 'CACG': 29, 'GATT': 29, 'ATCT': 29, 'TGGC': 29, 'AGCC': 29, 'TATC': 29, 'GCTC': 29, 'GGCT': 29, 'TCTA': 29, 'AACT': 28, 'CCTT': 28, 'CTTA': 28, 'TGTA': 28, 'TAGT': 28, 'AGTG': 28, 'CCGA': 27, 'AATG': 27, 'CCTG': 27, 'CTGT': 27, 'AGTC': 27, 'GTCC': 27, 'GGTT': 27, 'ACAC': 26, 'TACC': 26, 'CATC': 26, 'CATA': 26, 'GTGA': 26, 
'TGAA': 26, 'GGTC': 26, 'CTTG': 26, 'GCAC': 26, 'GGCA': 26, 'CGTC': 26, 'CTGG': 26, 'TAAG': 26, 'TCGT': 26, 'TGAT': 25, 'CAGA': 25, 'GAAC': 25, 'ACCA': 25, 'TTAC': 25, 'CATT': 25, 'AGAT': 25, 'CGGT': 25, 'ATTG': 25, 'TTGA': 25, 'GATA': 24, 'GGAC': 24, 'AAGC': 24, 'GTCA': 24, 'CAAT': 24, 'GCAG': 24, 'ACAT': 24, 'TGCC': 24, 'ATAG': 24, 'CGTG': 24, 'CGCA': 24, 'TAGG': 23, 'ACCG': 23, 'TTCG': 23, 'AGCG': 23, 'GTTC': 23, 'ACTT': 23, 'CGTT': 23, 'AGAC': 23, 'GCAT': 22, 'TCCG': 22, 'TAAC': 22, 'ACGC': 22, 'CAGC': 22, 'GACC': 22, 'CATG': 22, 'TCGA': 22, 'TAGA': 22, 'GCAA': 22, 'CTCG': 22, 'TACT': 22, 'AATC': 21, 'CGCT': 21, 'GAAT': 21, 'GCGT': 21, 'AGTA': 21, 'GCCA': 21, 'ATGG': 21, 'TCAA': 21, 'CTCA': 21, 'TGGA': 20, 'GAAG': 20, 'GATC': 20, 'TGCA': 20, 'GCCT': 19, 'GTCG': 19, 'CAAG': 19, 'TCGC': 19, 'CTGA': 19, 'GATG': 19, 'CTAA': 19, 'GCGA': 19, 'ATAC': 18, 'GTTA': 18, 'GCTA': 18, 'AGGT': 18, 'CCAG': 18, 'ACAG': 18, 'CTAG': 17, 'CGTA': 17, 'ACGT': 17, 'TACA': 17, 'AGCT': 16, 'CAGG': 16, 'ATGT': 16, 'ATCG': 16, 'ATGC': 15, 'TGAC': 14, 'TAGC': 14, 'ACTG': 14, 'TCAG': 14, 'CGAT': 14, 'TACG': 13, 'CAGT': 11, 'GTAC': 10, 'GACT': 9})"
This answer may help you in this particular case, but you should insist that whoever produced that result exports it in a format that can easily be imported by every programming language. Here you have a string representation of a Python object, which is definitely not a good way of exchanging data.
However, you can try this:
# Place the correct path to the file here.
fn <- "pattern.txt"
# Read the whole file into a single string, exactly as it is stored.
filecontent <- readChar(fn, file.info(fn)$size)
# Pull every 'KEY': COUNT pair out of the text with a regular expression.
# This replaces rewriting the text into R code and running it through
# eval(parse()), which would execute whatever code the external file
# happens to contain.
pairs <- regmatches(
  filecontent,
  gregexpr("'[^']*'[[:space:]]*:[[:space:]]*-?[0-9]+", filecontent)
)[[1]]
# The key is the text between the quotes; the count is what follows the colon
# (white space, including a line break, may sit between the two).
keys <- sub("^'([^']*)'[^']*$", "\\1", pairs)
counts <- as.numeric(sub("^'[^']*'[[:space:]]*:[[:space:]]*", "", pairs))
# Named list with one numeric element per pattern.
res <- as.list(setNames(counts, keys))
I have a nested list named mylist, which has length 4.
Each element of this list is an experiment: exp1.1, exp1.2, exp2.1 and exp2.2.
Each experiment contains observations of length (in days) of four plant growth stages: EM-V6 V6-R0 R0-R4 and R4-R9.
Each growth stage is organized as a data frame with year and mean.
Here is the complete data:
mylist=structure(list(exp1.1 = structure(list(`EM-V6` = structure(list(
year = 2011:2100, mean = c(34, 34, 32, 28, 25, 32, 32, 28,
27, 30, 32, 31, 33, 28, 26, 31, 33, 27, 34, 26, 28, 27, 27,
30, 29, 31, 34, 30, 26, 31, 33, 33, 27, 30, 28, 32, 31, 29,
32, 31, 25, 28, 28, 26, 32, 29, 26, 31, 28, 29, 30, 25, 27,
32, 27, 28, 28, 30, 24, 30, 29, 29, 29, 28, 26, 28, 26, 26,
28, 31, 30, 27, 26, 28, 25, 24, 24, 30, 27, 26, 26, 27, 26,
26, 24, 26, 28, 25, 30, 26)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame"), `V6-R0` = structure(list(year = 2011:2100,
mean = c(30, 33, 33, 32, 29, 30, 32, 31, 32, 30, 33, 30,
32, 33, 33, 32, 29, 31, 32, 28, 31, 29, 36, 29, 30, 30, 33,
31, 33, 30, 34, 32, 29, 31, 28, 30, 30, 29, 34, 31, 32, 31,
30, 28, 32, 29, 29, 32, 29, 28, 29, 29, 32, 31, 27, 32, 29,
31, 29, 29, 30, 29, 29, 29, 28, 28, 30, 30, 30, 32, 29, 29,
30, 29, 29, 29, 28, 28, 29, 30, 29, 29, 29, 30, 28, 30, 30,
29, 29, 29)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame"), `R0-R4` = structure(list(year = 2011:2100,
mean = c(31, 32, 32, 33, 32, 32, 33, 31, 34, 32, 33, 33,
32, 31, 33, 31, 32, 32, 32, 30, 32, 31, 34, 30, 31, 32, 34,
33, 34, 32, 36, 33, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32,
31, 30, 30, 31, 32, 32, 30, 30, 32, 31, 31, 32, 30, 32, 29,
32, 31, 30, 32, 30, 30, 31, 32, 30, 31, 30, 31, 32, 31, 31,
30, 30, 30, 31, 30, 30, 31, 30, 31, 30, 30, 30, 31, 32, 30,
31, 30, 30)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame"), `R4-R9` = structure(list(year = 2011:2100,
mean = c(27, 29, 28, 28, 27, 30, 29, 27, 30, 26, 30, 28,
29, 28, 29, 27, 29, 28, 25, 26, 26, 25, 27, 27, 27, 28, 30,
28, 29, 27, 29, 28, 29, 28, 26, 26, 28, 28, 30, 28, 27, 25,
26, 25, 25, 26, 26, 27, 25, 25, 26, 25, 27, 28, 24, 27, 25,
28, 26, 24, 27, 26, 27, 25, 26, 26, 24, 26, 25, 26, 24, 25,
25, 26, 26, 25, 25, 25, 25, 25, 26, 25, 25, 25, 25, 26, 26,
26, 25, 24)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame")), .Names = c("EM-V6", "V6-R0", "R0-R4",
"R4-R9")), exp1.2 = structure(list(`EM-V6` = structure(list(year = 2011:2100,
mean = c(34, 34, 32, 28, 25, 32, 32, 28, 27, 30, 32, 31,
33, 28, 26, 31, 33, 27, 34, 26, 28, 27, 27, 30, 29, 31, 34,
30, 26, 31, 33, 33, 27, 30, 28, 32, 31, 29, 32, 31, 25, 28,
28, 26, 32, 29, 26, 31, 28, 29, 30, 25, 27, 32, 27, 28, 28,
30, 24, 30, 29, 29, 29, 28, 26, 28, 26, 26, 28, 31, 30, 27,
26, 28, 25, 24, 24, 30, 27, 26, 26, 27, 26, 26, 24, 26, 28,
25, 30, 26)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame"), `V6-R0` = structure(list(year = 2011:2100,
mean = c(30, 33, 33, 32, 29, 30, 32, 31, 32, 30, 33, 30,
32, 33, 33, 32, 29, 31, 32, 28, 31, 29, 36, 29, 30, 30, 33,
31, 33, 30, 34, 32, 29, 31, 28, 30, 30, 29, 34, 31, 32, 31,
30, 28, 32, 29, 29, 32, 29, 28, 29, 29, 32, 31, 27, 32, 29,
31, 29, 29, 30, 29, 29, 29, 28, 28, 30, 30, 30, 32, 29, 29,
30, 29, 29, 29, 28, 28, 29, 30, 29, 29, 29, 30, 28, 30, 30,
29, 29, 29)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame"), `R0-R4` = structure(list(year = 2011:2100,
mean = c(31, 32, 32, 33, 32, 32, 33, 31, 34, 32, 33, 33,
32, 31, 33, 31, 32, 32, 32, 30, 32, 31, 34, 30, 31, 32, 34,
33, 34, 32, 36, 33, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32,
31, 30, 30, 31, 32, 32, 30, 30, 32, 31, 31, 32, 30, 32, 29,
32, 31, 30, 32, 30, 30, 31, 32, 30, 31, 30, 31, 32, 31, 31,
30, 30, 30, 31, 30, 30, 31, 30, 31, 30, 30, 30, 31, 32, 30,
31, 30, 30)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame"), `R4-R9` = structure(list(year = 2011:2100,
mean = c(27, 29, 28, 28, 27, 30, 29, 27, 30, 26, 30, 28,
29, 28, 29, 27, 29, 28, 25, 26, 26, 25, 27, 27, 27, 28, 30,
28, 29, 27, 29, 28, 29, 28, 26, 26, 28, 28, 30, 28, 27, 25,
26, 25, 25, 26, 26, 27, 25, 25, 26, 25, 27, 28, 24, 27, 25,
28, 26, 24, 27, 26, 27, 25, 26, 26, 24, 26, 25, 26, 24, 25,
25, 26, 26, 25, 25, 25, 25, 25, 26, 25, 25, 25, 25, 26, 26,
26, 25, 24)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame")), .Names = c("EM-V6", "V6-R0", "R0-R4",
"R4-R9")), exp2.1 = structure(list(`EM-V6` = structure(list(year = 2011:2100,
mean = c(34, 34, 32, 28, 25, 32, 32, 28, 27, 30, 32, 31,
33, 28, 26, 31, 33, 27, 34, 26, 28, 27, 27, 30, 29, 31, 34,
30, 26, 31, 33, 33, 27, 30, 28, 32, 31, 29, 32, 31, 25, 28,
28, 26, 32, 29, 26, 31, 28, 29, 30, 25, 27, 32, 27, 28, 28,
30, 24, 30, 29, 29, 29, 28, 26, 28, 26, 26, 28, 31, 30, 27,
26, 28, 25, 24, 24, 30, 27, 26, 26, 27, 26, 26, 24, 26, 28,
25, 30, 26)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame"), `V6-R0` = structure(list(year = 2011:2100,
mean = c(30, 33, 33, 32, 29, 30, 32, 31, 32, 30, 33, 30,
32, 33, 33, 32, 29, 31, 32, 28, 31, 29, 36, 29, 30, 30, 33,
31, 33, 30, 34, 32, 29, 31, 28, 30, 30, 29, 34, 31, 32, 31,
30, 28, 32, 29, 29, 32, 29, 28, 29, 29, 32, 31, 27, 32, 29,
31, 29, 29, 30, 29, 29, 29, 28, 28, 30, 30, 30, 32, 29, 29,
30, 29, 29, 29, 28, 28, 29, 30, 29, 29, 29, 30, 28, 30, 30,
29, 29, 29)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame"), `R0-R4` = structure(list(year = 2011:2100,
mean = c(31, 32, 32, 33, 32, 32, 33, 31, 34, 32, 33, 33,
32, 31, 33, 31, 32, 32, 32, 30, 32, 31, 34, 30, 31, 32, 34,
33, 34, 32, 36, 33, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32,
31, 30, 30, 31, 32, 32, 30, 30, 32, 31, 31, 32, 30, 32, 29,
32, 31, 30, 32, 30, 30, 31, 32, 30, 31, 30, 31, 32, 31, 31,
30, 30, 30, 31, 30, 30, 31, 30, 31, 30, 30, 30, 31, 32, 30,
31, 30, 30)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame"), `R4-R9` = structure(list(year = 2011:2100,
mean = c(27, 29, 28, 28, 27, 30, 29, 27, 30, 26, 30, 28,
29, 28, 29, 27, 29, 28, 25, 26, 26, 25, 27, 27, 27, 28, 30,
28, 29, 27, 29, 28, 29, 28, 26, 26, 28, 28, 30, 28, 27, 25,
26, 25, 25, 26, 26, 27, 25, 25, 26, 25, 27, 28, 24, 27, 25,
28, 26, 24, 27, 26, 27, 25, 26, 26, 24, 26, 25, 26, 24, 25,
25, 26, 26, 25, 25, 25, 25, 25, 26, 25, 25, 25, 25, 26, 26,
26, 25, 24)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame")), .Names = c("EM-V6", "V6-R0", "R0-R4",
"R4-R9")), exp2.2 = structure(list(`EM-V6` = structure(list(year = 2011:2100,
mean = c(34, 34, 32, 28, 25, 32, 32, 28, 27, 30, 32, 31,
33, 28, 26, 31, 33, 27, 34, 26, 28, 27, 27, 30, 29, 31, 34,
30, 26, 31, 33, 33, 27, 30, 28, 32, 31, 29, 32, 31, 25, 28,
28, 26, 32, 29, 26, 31, 28, 29, 30, 25, 27, 32, 27, 28, 28,
30, 24, 30, 29, 29, 29, 28, 26, 28, 26, 26, 28, 31, 30, 27,
26, 28, 25, 24, 24, 30, 27, 26, 26, 27, 26, 26, 24, 26, 28,
25, 30, 26)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame"), `V6-R0` = structure(list(year = 2011:2100,
mean = c(30, 33, 33, 32, 29, 30, 32, 31, 32, 30, 33, 30,
32, 33, 33, 32, 29, 31, 32, 28, 31, 29, 36, 29, 30, 30, 33,
31, 33, 30, 34, 32, 29, 31, 28, 30, 30, 29, 34, 31, 32, 31,
30, 28, 32, 29, 29, 32, 29, 28, 29, 29, 32, 31, 27, 32, 29,
31, 29, 29, 30, 29, 29, 29, 28, 28, 30, 30, 30, 32, 29, 29,
30, 29, 29, 29, 28, 28, 29, 30, 29, 29, 29, 30, 28, 30, 30,
29, 29, 29)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame"), `R0-R4` = structure(list(year = 2011:2100,
mean = c(31, 32, 32, 33, 32, 32, 33, 31, 34, 32, 33, 33,
32, 31, 33, 31, 32, 32, 32, 30, 32, 31, 34, 30, 31, 32, 34,
33, 34, 32, 36, 33, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32,
31, 30, 30, 31, 32, 32, 30, 30, 32, 31, 31, 32, 30, 32, 29,
32, 31, 30, 32, 30, 30, 31, 32, 30, 31, 30, 31, 32, 31, 31,
30, 30, 30, 31, 30, 30, 31, 30, 31, 30, 30, 30, 31, 32, 30,
31, 30, 30)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame"), `R4-R9` = structure(list(year = 2011:2100,
mean = c(27, 29, 28, 28, 27, 30, 29, 27, 30, 26, 30, 28,
29, 28, 29, 27, 29, 28, 25, 26, 26, 25, 27, 27, 27, 28, 30,
28, 29, 27, 29, 28, 29, 28, 26, 26, 28, 28, 30, 28, 27, 25,
26, 25, 25, 26, 26, 27, 25, 25, 26, 25, 27, 28, 24, 27, 25,
28, 26, 24, 27, 26, 27, 25, 26, 26, 24, 26, 25, 26, 24, 25,
25, 26, 26, 25, 25, 25, 25, 25, 26, 25, 25, 25, 25, 26, 26,
26, 25, 24)), .Names = c("year", "mean"), row.names = c(NA,
-90L), class = "data.frame")), .Names = c("EM-V6", "V6-R0", "R0-R4",
"R4-R9"))), .Names = c("exp1.1", "exp1.2", "exp2.1", "exp2.2"
))
What I need to do is to "unlist" this nested list to a data frame that will look like this:
YEAR EXP EM-V6 V6-R0 R0-R4 R4-R9
2011 exp1.1 34 30 31 27
2011 exp1.2 34 30 31 27
2011 exp2.1 34 30 31 27
2011 exp2.2 34 30 31 27
Which means:
- first year, first experiment, and growth stages.
- first year, second experiment and growth stages.
- first year, third experiment and growth stages
- first year, fourth experiment and growth stages
- second year, first experiment and growth stages
and so on.
How to perform that data transformation?
An alternative using rbindlist from the data.table-package twice:
library(data.table)

# bind the data frames inside each experiment together; 'idcol' stores the
# name of the growth stage each row came from in a new column 'stages'
# (the result is a list with one data.table per experiment)
step1 <- lapply(mylist, rbindlist, idcol = "stages")

# bind that list together and store the experiment name in 'experiment'
step2 <- rbindlist(step1, idcol = "experiment")

# reshape to wide format: one row per year/experiment, one column per stage
dcast(step2, year + experiment ~ stages, value.var = "mean")
Or in one go:
# stack the stages within each experiment, stack the experiments, then cast
# to wide format; 'idcol' is spelled out in full instead of relying on
# partial argument matching of 'id'
dcast(
  rbindlist(lapply(mylist, rbindlist, idcol = "stages"), idcol = "experiment"),
  year + experiment ~ stages,
  value.var = "mean"
)
which gives:
year experiment EM-V6 R0-R4 R4-R9 V6-R0
1: 2011 exp1.1 34 31 27 30
2: 2011 exp1.2 34 31 27 30
3: 2011 exp2.1 34 31 27 30
4: 2011 exp2.2 34 31 27 30
5: 2012 exp1.1 34 32 29 33
---
356: 2099 exp2.2 30 30 25 29
357: 2100 exp1.1 26 30 24 29
358: 2100 exp1.2 26 30 24 29
359: 2100 exp2.1 26 30 24 29
360: 2100 exp2.2 26 30 24 29
Alternate tidyverse:
library(tidyverse)

# For every experiment, stack its stage tables (stage name goes into `id`);
# map_df() then stacks the experiments (experiment name goes into `EXP`).
# Finally each stage becomes a column of its own.
mylist %>%
  map_df(function(stages) bind_rows(stages, .id = "id"), .id = "EXP") %>%
  spread(key = id, value = mean)
We can use tidyverse with more compact and readable code
library(dplyr)
library(tidyr)
library(purrr)

res1 <- mylist %>%
  # per experiment: stack the stage data frames, stage name goes into `id`
  map(function(stages) bind_rows(stages, .id = "id")) %>%
  # across experiments: stack the results, experiment name goes into `EXP`
  bind_rows(.id = "EXP") %>%
  # long -> wide: one column per growth stage
  spread(key = id, value = mean)
head(res1, 4)
# EXP year EM-V6 R0-R4 R4-R9 V6-R0
#1 exp1.1 2011 34 31 27 30
#2 exp1.1 2012 34 32 29 33
#3 exp1.1 2013 32 32 28 33
#4 exp1.1 2014 28 33 28 32
Or we can approach this by looping through mylist with lapply, creating a new column 'name' using Map by cbinding the names of the inner list elements, and then rbinding the list elements with do.call(rbind, ...). A second Map then creates a new column based on the names of 'mylist'; we rbind the list elements again and finally use reshape from base R to convert the result to 'wide' format.
# Stack the stage data frames of one experiment and tag every row with the
# name of the stage it came from (column `name`).
stack_stages <- function(stages) {
  tagged <- Map(cbind, stages, name = names(stages))
  do.call(rbind, tagged)
}

# Do that for every experiment, tag the rows with the experiment name
# (column `EXP`) and stack everything into one long data frame.
res <- do.call(
  rbind,
  Map(cbind, lapply(mylist, stack_stages), EXP = names(mylist))
)

# Long -> wide: one `mean.<stage>` column per growth stage.
res2 <- reshape(
  res,
  idvar = c("year", "EXP"),
  timevar = "name",
  direction = "wide"
)
rownames(res2) <- NULL
head(res2, 4)
# year EXP mean.EM-V6 mean.V6-R0 mean.R0-R4 mean.R4-R9
#1 2011 exp1.1 34 30 31 27
#2 2012 exp1.1 34 33 32 29
#3 2013 exp1.1 32 33 32 28
#4 2014 exp1.1 28 32 33 28
NOTE: No external packages used (100% base R)
or use dcast from reshape2 to transform to 'wide' format
library(reshape2)

# Cast the long table `res` to wide format: one row per year/EXP pair and
# one column per value of `name`, filled with `mean`.
res2 <- dcast(data = res, formula = year + EXP ~ name, value.var = "mean")
head(res2, 4)
# year EXP EM-V6 V6-R0 R0-R4 R4-R9
#1 2011 exp1.1 34 30 31 27
#2 2011 exp1.2 34 30 31 27
#3 2011 exp2.1 34 30 31 27
#4 2011 exp2.2 34 30 31 27