how can I reorganize a data based on two column - r

I have a data like below
# Example data: 30 rows. 'data1' and 'data2' are numeric measurements,
# 'time' is an integer (1, 10 or 17) and 'place' is a factor with the
# levels B02-B06 and C02-C06.
df<- structure(list(data1 = c(0.013818378, 0.014362551, 0.014647562,
0.0136627, 0.015510173, 0.006818502, 0.006683564, 0.006655434,
0.006691479, 0.00666666, 0.014507653, 0.017446481, 0.014021427,
0.013963069, 0.020706391, 0.007104358, 0.006809539, 0.006680631,
0.009059533, 0.006681197, 0.015691738, 0.016709763, 0.015761994,
0.016062111, 0.015917196, 0.006816436, 0.006809539, 0.006680631,
0.009059533, 0.006681197), data2 = c(0.045378058, 0.041371486,
0.046058451, 0.040479177, 0.051143336, 0.016131932, 0.014399847,
0.014950329, 0.016408355, 0.015886182, 0.046151342, 0.05265521,
0.046046663, 0.040515428, 0.086865434, 0.019222881, 0.016926183,
0.016703444, 0.081352865, 0.132841645, 0.051641343, 0.059851738,
0.04830957, 0.047550067, 0.049228835, 0.015154055, 0.016926183,
0.016703444, 0.081352865, 0.132841645), time = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L
), place = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L), .Label = c("B02", "B03", "B04", "B05",
"B06", "C02", "C03", "C04", "C05", "C06"), class = "factor")), .Names = c("data1",
"data2", "time", "place"), class = "data.frame", row.names = c(NA,
-30L))
It contains several datasets, which are distinguishable by time.
I am trying to separate them and reorganise them into several data frames.
Each column except time and place is one dataset that needs to be reorganised,
for example for data1 at time 1
B 0.013818378 0.014362551 0.014647562 0.0136627 0.015510173
C 0.006818502 0.006683564 0.006655434 0.006691479 0.00666666
data 1 at time 10
B 0.014507653 0.017446481 0.014021427 0.013963069 0.020706391
C 0.007104358 0.006809539 0.006680631 0.009059533 0.006681197
etc etc

We separate the 'place' column into two columns by splitting between the letter and digits, and spread into 'wide' format
# dplyr supplies select() and the pipe; tidyr supplies separate() and spread().
library(dplyr)
library(tidyr)
df %>%
  # Split 'place' at the boundary between its letter and its digits,
  # e.g. "B02" becomes grp = "B" and number = "02".
  separate(col = place, into = c("grp", "number"),
           sep = "(?<=[A-Z])(?=[0-9])") %>%
  # Only the 'data1' measurements are wanted in this table.
  select(-data2) %>%
  # One row per time/grp combination, one column per 'number'.
  spread(key = number, value = data1)
# time grp 02 03 04 05 06
#1 1 B 0.013818378 0.014362551 0.014647562 0.013662700 0.015510173
#2 1 C 0.006818502 0.006683564 0.006655434 0.006691479 0.006666660
#3 10 B 0.014507653 0.017446481 0.014021427 0.013963069 0.020706391
#4 10 C 0.007104358 0.006809539 0.006680631 0.009059533 0.006681197
#5 17 B 0.015691738 0.016709763 0.015761994 0.016062111 0.015917196
#6 17 C 0.006816436 0.006809539 0.006680631 0.009059533 0.006681197
If we want a list of datasets, one each for 'data1' and 'data2':
# Names of all columns whose name contains "data" (here: data1 and data2).
nm1 <- grep("data", names(df), value = TRUE)
# Build one wide table per measurement column; the result is a list with one
# data frame for each entry of 'nm1'.
purrr::map(nm1, function(value_col) {
  df %>%
    # Drop every measurement column, then add back only the current one.
    select(-one_of(nm1), value_col) %>%
    separate(place, into = c("grp", "number"),
             sep = "(?<=[A-Z])(?=[0-9])") %>%
    spread(key = number, value = value_col)
})
#[[1]]
# time grp 02 03 04 05 06
#1 1 B 0.013818378 0.014362551 0.014647562 0.013662700 0.015510173
#2 1 C 0.006818502 0.006683564 0.006655434 0.006691479 0.006666660
#3 10 B 0.014507653 0.017446481 0.014021427 0.013963069 0.020706391
#4 10 C 0.007104358 0.006809539 0.006680631 0.009059533 0.006681197
#5 17 B 0.015691738 0.016709763 0.015761994 0.016062111 0.015917196
#6 17 C 0.006816436 0.006809539 0.006680631 0.009059533 0.006681197
#[[2]]
# time grp 02 03 04 05 06
#1 1 B 0.04537806 0.04137149 0.04605845 0.04047918 0.05114334
#2 1 C 0.01613193 0.01439985 0.01495033 0.01640835 0.01588618
#3 10 B 0.04615134 0.05265521 0.04604666 0.04051543 0.08686543
#4 10 C 0.01922288 0.01692618 0.01670344 0.08135286 0.13284165
#5 17 B 0.05164134 0.05985174 0.04830957 0.04755007 0.04922883
#6 17 C 0.01515405 0.01692618 0.01670344 0.08135286 0.13284165
It is not clear what the output should look like when we have multiple value columns. The dcast function from data.table can handle multiple value.var columns:
library(data.table)
# Convert 'df' to a data.table by reference.
setDT(df)
# Add the letter part ('grp') and the digit part ('number') of 'place' as two
# new columns, again by reference.
df[, c("grp", "number") := tstrsplit(place, "(?<=[A-Z])(?=[0-9])", perl = TRUE)]
# Cast to wide format: one row per grp/time, and one column for every
# combination of value column (data1, data2) and 'number'.
dcast(df, formula = grp + time ~ number, value.var = c("data1", "data2"))

It is somewhat unclear from your question, but I think that this is what you want:
library(tidyverse)
df %>%
  mutate(
    # The digits of 'place' become the future column names ...
    column = str_extract(place, "[0-9]+"),
    # ... and 'place' itself is reduced to its first capital letter.
    place = str_extract(place, "[A-Z]")
  ) %>%
  # Stack data1 and data2 into a key column 'data' and a value column 'val'.
  gather(key = "data", value = "val", data1, data2) %>%
  # One column per value of 'column'.
  spread(key = column, value = val) %>%
  # One list element per original measurement column.
  split(f = .$data)
Which produces the following format:
$data1
time place data 02 03 04 05 06
1 1 B data1 0.013818378 0.014362551 0.014647562 0.013662700 0.015510173
3 1 C data1 0.006818502 0.006683564 0.006655434 0.006691479 0.006666660
5 10 B data1 0.014507653 0.017446481 0.014021427 0.013963069 0.020706391
7 10 C data1 0.007104358 0.006809539 0.006680631 0.009059533 0.006681197
9 17 B data1 0.015691738 0.016709763 0.015761994 0.016062111 0.015917196
11 17 C data1 0.006816436 0.006809539 0.006680631 0.009059533 0.006681197
$data2
time place data 02 03 04 05 06
2 1 B data2 0.04537806 0.04137149 0.04605845 0.04047918 0.05114334
4 1 C data2 0.01613193 0.01439985 0.01495033 0.01640835 0.01588618
6 10 B data2 0.04615134 0.05265521 0.04604666 0.04051543 0.08686543
8 10 C data2 0.01922288 0.01692618 0.01670344 0.08135286 0.13284165
10 17 B data2 0.05164134 0.05985174 0.04830957 0.04755007 0.04922883
12 17 C data2 0.01515405 0.01692618 0.01670344 0.08135286 0.13284165

Related

Assign established values randomly by ID in R

I have this file:
ID P
1 10
1 12
1 11
2 9
2 8
2 10
3 11
3 12
3 14
4 15
4 16
4 8
5 11
5 13
5 10
6 14
6 16
6 11
And I would like to assign these values (a,b,c) randomly to the file:
like this:
ID P Group
1 10 a
1 12 b
1 11 c
2 9 c
2 8 a
2 10 b
3 11 a
3 12 c
3 14 b
4 15 c
4 16 a
4 8 b
5 11 b
5 13 c
5 10 a
6 14 b
6 16 c
6 11 a
I need to do this several times, with a new random assignment each time. I tried this:
df %>% group_by(ID) %>% replicate(1,sample(df$group))
but, of course, it didn't work. Any suggestions?
Here is an option with sample
library(dplyr)
df1 %>%
  group_by(ID) %>%
  # Shuffle the labels within each ID so that no label is repeated inside an
  # ID, as in the expected output. Sampling with replacement is used only as
  # a fallback when an ID has more rows than the three available labels,
  # where sampling without replacement would raise an error.
  mutate(Group = sample(c("a", "b", "c"), n(), replace = n() > 3))
data
# Reproducible copy of the question's table: 18 rows with the integer
# columns ID and P.
df1 <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), P = c(10L, 12L, 11L, 9L, 8L,
10L, 11L, 12L, 14L, 15L, 16L, 8L, 11L, 13L, 10L, 14L, 16L, 11L
)), class = "data.frame", row.names = c(NA, -18L))
Two solutions, one with grouping, the other without
library(tidyverse)
# The question's table, entered row by row.
df <- dplyr::tribble(
  ~ID, ~P,
  1, 10,
  1, 12,
  1, 11,
  2, 9,
  2, 8,
  2, 10,
  3, 11,
  3, 12,
  3, 14,
  4, 15,
  4, 16,
  4, 8,
  5, 11,
  5, 13,
  5, 10,
  6, 14,
  6, 16,
  6, 11
)
# Labels to hand out.
sample_vector <- c("a", "b", "c")
## Without grouping by ID: every row gets an independent random label, so a
## label may occur more than once within the same ID.
df_2 <- df %>%
  mutate(Group = sample(sample_vector, nrow(df), replace = TRUE))
## With grouping by ID: the labels are shuffled within each ID, so no label is
## repeated inside an ID (as in the expected output). Replacement is used only
## as a fallback when an ID has more rows than there are labels.
df_2 <- df %>%
  group_by(ID) %>%
  mutate(
    Group = sample(sample_vector, n(), replace = n() > length(sample_vector))
  )

r - Find corresponding value from multiple columns according to pmin in multiple columns

My df is sth like this:
Item P P1 P2 P3 D1 D2 D3 pmin num NP
A 10 8 11 20 2 1 10 1 D2 11
B 10 8 11 20 2 1 10 1 D2 11
C 10 8 11 20 2 1 10 1 D2 11
D 50 40 35 70 10 15 20 10 D1 40
E 20 15 22 30 5 2 10 2 D2 22
As shown in my df above, I've first calculated D1 and D2, 'pmin' is the parallel min for D1 and D2, 'num' gives the column name(D1 or D2) corresponding to my pmin.
Now what I want is return a new column called 'NP' that gives me the corresponding values in P1 or P2 according to the pmin (by looking across the row). For example, if it says D2 in 'num', looking across the row, I return value from P2, if it says D1 in 'num', I return the value from P1.
Not sure if I explained it nicely but here's how I did for 'pmin' and 'num':
df$pmin = do.call(pmin, df[,5:6] )
df$num = apply(df[,5:6], 1,function(x) names(x)[which.min(x)])
Also in my real dataset, I have P1 through P4 and D1 through D4.
I tried sth like
ifelse( num == 'D1', P1, P2)
but it doesn't work for more than two columns (P1~P4..)
Thanks in advance!!
btw does anyone know how to use
case_when()
from library(dplyr) to get 'NP'?
We can use row/column indexing to extract the elements of 'P1/P2' columns that corresponds to the 'D1', 'D2'
# Two-column (row, column) index: row i is paired with the position of
# df$num[i] among "D1", "D2", "D3".
m1 <- cbind(
  seq_len(nrow(df)),
  match(df$num, c("D1", "D2", "D3"))
)
# Index the P columns with that matrix, so each row takes the P value at the
# same position as the D column named in 'num'.
df$NP <- as.matrix(df[c("P1", "P2", "P3")])[m1]
df$NP
#[1] 11 11 11 40 22
data
# Reproducible copy of the question's table: 5 rows. 'Item' and 'num' are
# character columns; all other columns are integers.
df <- structure(list(Item = c("A", "B", "C", "D", "E"), P = c(10L,
10L, 10L, 50L, 20L), P1 = c(8L, 8L, 8L, 40L, 15L), P2 = c(11L,
11L, 11L, 35L, 22L), P3 = c(20L, 20L, 20L, 70L, 30L), D1 = c(2L,
2L, 2L, 10L, 5L), D2 = c(1L, 1L, 1L, 15L, 2L), D3 = c(10L, 10L,
10L, 20L, 10L), pmin = c(1L, 1L, 1L, 10L, 2L), num = c("D2",
"D2", "D2", "D1", "D2"), NP = c(11L, 11L, 11L, 40L, 22L)),
class = "data.frame", row.names = c(NA,
-5L))

Subsetting a data frame according to recursive rows and creating a column for ordering

Consider the sample data
# Sample input: 7 rows with the integer columns id, A and B; id 1 has four
# rows, id 2 has two and id 3 has one.
df <-
structure(
list(
id = c(1L, 1L, 1L, 1L, 2L, 2L, 3L),
A = c(20L, 12L, 13L, 8L, 11L, 21L, 17L),
B = c(1L, 1L, 0L, 0L, 1L, 0L, 0L)
),
.Names = c("id", "A", "B"),
class = "data.frame",
row.names = c(NA,-7L)
)
Each id (stored in column 1) has a varying number of entries for columns A and B. In the example data, there are four observations with id = 1. I am looking for a way to subset this data in R so that there are at most 3 entries for each id, and then to create another column (labelled C) that holds the order of the entries within each id. The expected output would look like:
# Expected result: 6 rows, at most three per id, with the extra column C
# numbering the rows within each id.
df <-
structure(
list(
id = c(1L, 1L, 1L, 2L, 2L, 3L),
A = c(20L, 12L, 13L, 11L, 21L, 17L),
B = c(1L, 1L, 0L, 1L, 0L, 0L),
C = c(1L, 2L, 3L, 1L, 2L, 1L)
),
.Names = c("id", "A", "B","C"),
class = "data.frame",
row.names = c(NA,-6L)
)
Your help is much appreciated.
Like this?
library(data.table)
# Work on a data.table copy of 'df'.
dt <- as.data.table(df)
# Number the rows within each id (1, 2, ...) in their current order.
dt[, C := seq_len(.N), by = id]
# Keep only the rows numbered 1 to 3 within every id.
dt <- dt[C <= 3]
dt
# id A B C
# 1: 1 20 1 1
# 2: 1 12 1 2
# 3: 1 13 0 3
# 4: 2 11 1 1
# 5: 2 21 0 2
# 6: 3 17 0 1
Here is one option with dplyr that keeps the top 3 values based on A (following the comments of Ronak Shah).
library(dplyr)
df %>%
  group_by(id) %>%
  # Within each id, keep the rows holding the 3 largest values of A.
  top_n(3, A) %>%
  # Number the kept rows within each id: 'id' is constant inside a group, so
  # ranking it with ties broken by position gives 1, 2, ... in row order.
  mutate(C = rank(id, ties.method = "first"))
# A tibble: 6 x 4
# Groups: id [3]
id A B C
<int> <int> <int> <int>
1 1 20 1 1
2 1 12 1 2
3 1 13 0 3
4 2 11 1 1
5 2 21 0 2
6 3 17 0 1

how can I add partially related number to a data

I am trying to add, for each row, the number contained in its data label.
My data looks like this:
# Example data: 20 rows; both 'data' (labels such as "data1", "data10") and
# 'values' are stored as factors.
df <- structure(list(data = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 3L,
4L, 4L, 5L, 5L, 6L, 5L, 7L, 7L, 8L, 8L, 2L, 2L, 2L), .Label = c("data1",
"data10", "data2", "data3", "data4", "data5", "data6", "data7"
), class = "factor"), values = structure(c(3L, 8L, 18L, 1L, 15L,
17L, 19L, 7L, 2L, 2L, 11L, 10L, 6L, 4L, 9L, 12L, 14L, 5L, 13L,
16L), .Label = c("112864.443", "11319531", "12874.443", "142983324",
"1612410048", "16349475.63", "184901841", "2223793.8", "30553282.01",
"312004.547", "3135868.44", "317403612.9", "3686081.063", "43701608",
"623793.8", "64959501.42", "67666215", "767666215", "775987137.8"
), class = "factor")), .Names = c("data", "values"), class = "data.frame", row.names = c(NA,
-20L))
I want the number from the first column's label to appear next to each row. Since the labels are not consecutive, I don't know how to add the numbers as a separate column. The desired output should look like this:
data values
data1 12874.443 1
data1 2223793.8 1
data1 767666215 1
data1 112864.443 1
data1 623793.8 1
data2 67666215 2
data2 775987137.8 2
data3 184901841 3
data3 11319531 3
data4 11319531 4
data4 3135868.44 4
data5 312004.547 5
data4 16349475.63 4
data6 142983324 6
data6 30553282.01 6
data7 317403612.9 7
data7 43701608 7
data10 1612410048 10
data10 3686081.063 10
data10 64959501.42 10
one way is to use gsub to extract the value and add it as another column
# Remove every non-digit character from 'data', leaving only the digits as a
# character string (e.g. "data10" -> "10").
df$label <- gsub("[^[:digit:]]", "", df$data)
another way is to use str_extract thanks to this question R: split character data into numbers and letters
library(stringr)
# Extract the first run of digits from 'data' and convert it to a number.
df$label <- as.numeric(str_extract(df$data, "[0-9]+"))
> df
# data values label
# 1 data1 12874.443 1
# 2 data1 2223793.8 1
# 3 data1 767666215 1
# 4 data1 112864.443 1
# 5 data1 623793.8 1
# 6 data2 67666215 2
# 7 data2 775987137.8 2
# 8 data3 184901841 3
# 9 data3 11319531 3
# 10 data4 11319531 4
# 11 data4 3135868.44 4
# 12 data5 312004.547 5
# 13 data4 16349475.63 4
# 14 data6 142983324 6
# 15 data6 30553282.01 6
# 16 data7 317403612.9 7
# 17 data7 43701608 7
# 18 data10 1612410048 10
# 19 data10 3686081.063 10
# 20 data10 64959501.42 10

Calculating top 4 of column 1 by column 2 - R

I'm new in R and to be honest don't know how to call what I'm looking for :)
I have data-set "ds" set with 2 columns:
D | res
==========
Ds 20
Dx 23
Dp 1
Ds 12
Ds 23
Ds 54
Dn 65
Ds 122
Dx 11
Dx 154
Dx 18
Do 4
Df 17
Dp 5
Dp 107
Dp 8
Df 3
Dp 33
Dd 223
Dc 7
Dv 22
Du 34
Dh 22
Ds 12
Dy 78
Dd 128
I need to calculate top 4 from column "D" by "Res" so desired result would look like :
D | Res
========
Dd 351
Dp 154
Ds 243
Dx 206
and by %age:
D | % Of Total
==========
Dd 29.10%
Dp 12.77%
Ds 20.15%
Dx 17.08%
Thanks
We can use aggregate() to obtain the sum of each type of "D", and we can introduce a new column to account for the edit of the OP and include also the percentage.
In order to display the result in the desired form, we can apply the order() function to rearrange the rows according to the value of Res. The function rev() in this case ensures that the highest value is put on top, and head() with the parameter 4 displays the first four rows.
# Sum Res for every combination of the remaining columns of df1 (here: D).
summarized <- aggregate(Res ~ ., data = df1, FUN = sum)
# Share of the grand total, rounded to two decimals and shown with a % sign.
summarized$Perc <- paste0(
  round(summarized$Res / sum(summarized$Res) * 100, 2),
  "%"
)
# Largest sums first (reversed ascending order), then the first four rows.
head(summarized[rev(order(summarized$Res)), ], n = 4)
D Res Perc
2 Dd 351 29.1%
8 Ds 243 20.15%
11 Dx 206 17.08%
7 Dp 154 12.77%
data
# Reproducible copy of the question's table: 26 rows with the factor column D
# (12 levels) and the integer column Res.
df1 <- structure(list(D = structure(c(8L, 11L, 7L, 8L, 8L, 8L, 5L,
8L, 11L, 11L, 11L, 6L, 3L, 7L, 7L, 7L, 3L, 7L, 2L, 1L, 10L, 9L,
4L, 8L, 12L, 2L), .Label = c("Dc", "Dd", "Df", "Dh", "Dn", "Do",
"Dp", "Ds", "Du", "Dv", "Dx", "Dy"), class = "factor"), Res = c(20L,
23L, 1L, 12L, 23L, 54L, 65L, 122L, 11L, 154L, 18L, 4L, 17L, 5L,
107L, 8L, 3L, 33L, 223L, 7L, 22L, 34L, 22L, 12L, 78L, 128L)),
.Names = c("D", "Res"), class = "data.frame", row.names = c(NA, -26L))
If you mean to sum Res per D and then select the top 4 sums (assuming you made mistakes calculating the sums for ds and dp) you could try:
library(dplyr)
df1 %>%
  # Each row's share of the overall total of Res.
  mutate(per = Res / sum(Res)) %>%
  group_by(D) %>%
  # Total and summed share for every level of D.
  summarise(Res = sum(Res), perc = sum(per)) %>%
  # Keep the 4 levels with the largest totals.
  top_n(4, Res)
Source: local data frame [4 x 3]
D Res perc
(fctr) (int) (dbl)
1 Dd 351 0.2910448
2 Dp 154 0.1276949
3 Ds 243 0.2014925
4 Dx 206 0.1708126
Option using data.table
library(data.table)
# Sum 'Res' per level of 'D', express each sum as a percentage of the grand
# total, sort by descending percentage and keep the first four rows.
# Uses the sample data 'df1' (columns D and Res) given in this thread;
# as.data.table() works on a copy, so 'df1' itself stays a plain data frame.
# head() is used instead of [1:4, ] so that fewer than four groups do not
# produce padded NA rows.
out <- head(
  setorder(
    as.data.table(df1)[, .(tmp = sum(Res)), by = D][
      , .(D, ptg = (tmp / sum(tmp)) * 100)
    ],
    -ptg
  ),
  4L
)
#> out
# D ptg
#1: Dd 29.10448
#2: Ds 20.14925
#3: Dx 17.08126
#4: Dp 12.76949

Resources