I have a data frame which looks like this:
name val1 val2
A 2 3
B 5 6
C 7 9
I want a data frame like this:
name Grp value
A val1 2
B val1 5
C val1 7
A val2 3
B val2 6
C val2 9
What is the optimal solution?
This is called reshaping a data.frame from wide to long format. One option would be to use gather from the tidyr package:
library(tidyr)
gather(df, Grp, Value, -name)
# name Grp Value
#1 A val1 2
#2 B val1 5
#3 C val1 7
#4 A val2 3
#5 B val2 6
#6 C val2 9
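Note that gather() has since been superseded in tidyr (version 1.0.0 and later) by pivot_longer(); a sketch of the equivalent call, with the question's data built inline so it runs standalone:
library(tidyr)
df <- data.frame(name = c("A", "B", "C"), val1 = c(2, 5, 7), val2 = c(3, 6, 9))
# pivot every column except 'name' into key/value pairs
pivot_longer(df, cols = -name, names_to = "Grp", values_to = "Value")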
Or just use melt from the reshape2 package:
library(reshape2)
melt(df)
# name variable value
# 1 A val1 2
# 2 B val1 5
# 3 C val1 7
# 4 A val2 3
# 5 B val2 6
# 6 C val2 9
Edit: per @AnandaMahto's comment, if you insist on having the variable column named Grp, you can alternatively do:
melt(df, variable.name = "Grp")
A base R option would be to use stack
cbind(name=df$name, stack(df, select=-name))
# name values ind
#1 A 2 val1
#2 B 5 val1
#3 C 7 val1
#4 A 3 val2
#5 B 6 val2
#6 C 9 val2
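If you want the exact column names and order from the question, a small cleanup sketch on top of the stack() result (df as above):
out <- cbind(name = df$name, stack(df, select = -name))
# reorder the columns and rename them to match the requested output
out <- setNames(out[, c("name", "ind", "values")], c("name", "Grp", "value"))
out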
Or without any extra package:
ddff <- data.frame(name = c("A","B","C"), val1 = c(2,5,7), val2 = c(3,6,9))
ddff2 <- reshape(ddff, direction = "long",
                 varying = list(names(ddff)[2:3]),
                 v.names = "values",
                 times = c("val1","val2"))[, 1:3]
ddff2
# name time values
#1.val1 A val1 2
#2.val1 B val1 5
#3.val1 C val1 7
#1.val2 A val2 3
#2.val2 B val2 6
#3.val2 C val2 9
Related
I would like to replace missing values with the value from another column if all values within a group are missing. Here is an example and something I thought would work. There can be an unlimited number of groups.
library(tidyverse)
df <- tibble(ID = c("A", "A", "A", "B", "B", "B"),
             val1 = c(1, 2, 3, 4, 5, 6),
             val2 = c(NA, NA, NA, NA, 2, 3))
df %>%
group_by(ID) %>%
mutate(val2 = ifelse(all(is.na(val2)), val1, val2))
# Groups: ID [2]
ID val1 val2
<chr> <dbl> <dbl>
1 A 1 1
2 A 2 1
3 A 3 1
4 B 4 NA
5 B 5 NA
6 B 6 NA
What I would like: val2 should take its values from val1 if all val2 values within the group are missing. As it stands, the code seems to recycle just the first value. Nothing should happen if not all values are missing.
Result:
# A tibble: 6 x 3
ID val1 val2
<chr> <dbl> <dbl>
1 A 1 1
2 A 2 2
3 A 3 3
4 B 4 NA
5 B 5 2
6 B 6 3
Does this work? Unlike ifelse(), whose result takes the length of its condition (here length 1), case_when() recycles the length-one condition against the full-length val1 and val2 vectors, so every row in the group gets filled:
library(dplyr)
df %>% group_by(ID) %>% mutate(val2 = case_when(all(is.na(val2)) ~ val1, TRUE ~ val2))
# A tibble: 6 x 3
# Groups: ID [2]
ID val1 val2
<chr> <dbl> <dbl>
1 A 1 1
2 A 2 2
3 A 3 3
4 B 4 NA
5 B 5 2
6 B 6 3
You almost had it. I create a group-level indicator first and do the replacement after ungrouping, so ifelse() operates elementwise on a full-length condition:
df %>%
group_by(ID) %>%
mutate(val3 = ifelse(all(is.na(val2)),1,0)) %>%
ungroup() %>%
mutate(val2 = ifelse(val3 == 1, val1, val2)) %>%
select(-val3)
Output:
# A tibble: 6 x 3
ID val1 val2
<chr> <dbl> <dbl>
1 A 1 1
2 A 2 2
3 A 3 3
4 B 4 NA
5 B 5 2
6 B 6 3
I have a data frame in R that I've edited to look like the following:
(screenshot of the data frame omitted)
I would like to change the cells in column 2 with an 'x' to match the word at the top of the list of x's. For example, I would like column 2 rows 8-20 to list the word "farm_working_expenses", column 2 rows 22-27 to list the word "feed_expenses", etc.
Is there any way to edit the 'x' cells in the data frame to match the contents of a previous cell in the same column?
dplyr::na_if() can be used to replace x with NA.
data1 <- data.frame(col1 = LETTERS[1:10],
                    col2 = c('val1', rep('x', 3), 'val2', rep('x', 3), 'val3', 'x'))
data1
# col1 col2
# 1 A val1
# 2 B x
# 3 C x
# 4 D x
# 5 E val2
# 6 F x
# 7 G x
# 8 H x
# 9 I val3
# 10 J x
data1$col2 <- dplyr::na_if(data1$col2, 'x')
Using tidyr::fill(), the last observation can be carried forward until a new value appears in the same column.
data1 %>% tidyr::fill(col2)
# col1 col2
# 1 A val1
# 2 B val1
# 3 C val1
# 4 D val1
# 5 E val2
# 6 F val2
# 7 G val2
# 8 H val2
# 9 I val3
# 10 J val3
zoo::na.locf can also be used for the same purpose.
data1 <- data.frame(col1 = LETTERS[1:10],
                    col2 = c('val1', rep(NA, 3), 'val2', rep(NA, 3), 'val3', NA))
data1
# col1 col2
# 1 A val1
# 2 B <NA>
# 3 C <NA>
# 4 D <NA>
# 5 E val2
# 6 F <NA>
# 7 G <NA>
# 8 H <NA>
# 9 I val3
# 10 J <NA>
zoo::na.locf(data1$col2)
# [1] val1 val1 val1 val1 val2 val2 val2 val2 val3 val3
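To write the filled values back into the data frame, assign the result; na.rm = FALSE keeps the vector at full length even when leading NAs occur:
data1$col2 <- zoo::na.locf(data1$col2, na.rm = FALSE)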
I have a large data frame with groups and subgroups. I would like to determine the index of the subgroup in each group, like shown in the OUTPUT column of the following data frame:
df <- data.frame(
  Group = factor(c("A","A","A","A","A","B","B","B","B")),
  Subgroup = factor(c("a","a","b","b","b","a","a","b","b")),
  OUTPUT = c(1,1,2,2,2,1,1,2,2)
)
I've tried several possibilities without any success. I'd like to work with dplyr, but I'm not sure how to go about this. The following code returns an unexpected result.
require(dplyr)
df <- df %>%
group_by(Group) %>%
mutate(
OUTPUT_2 = dplyr::id(Subgroup)
)
#df
# Group Subgroup OUTPUT_2
# (fctr) (fctr) (int)
#1 A a 8
#2 A a 8
#3 A b 8
#4 A b 8
#5 A b 8
#6 B a 4
#7 B a 4
#8 B b 4
#9 B b 4
I have the feeling I'm close, but I'm not getting there. Can anybody help?
Here is a solution with data.table without aggregation:
dt[order(Subgroup), Output := cumsum(!duplicated(Subgroup)), by = .(Group)]
This will be much faster compared to methods based on aggregation.
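For completeness, a runnable sketch of the above, assuming dt is the question's df converted to a data.table:
library(data.table)
dt <- as.data.table(df)  # df as defined in the question
# within each Group, the first occurrence of each Subgroup (in sorted order) bumps the counter
dt[order(Subgroup), Output := cumsum(!duplicated(Subgroup)), by = .(Group)]
dt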
We can use the factor route with dplyr
library(dplyr)
df %>%
group_by(Group) %>%
mutate(OUTPUT = as.numeric(factor(Subgroup, levels= unique(Subgroup))))
# Group Subgroup OUTPUT
# <fctr> <fctr> <dbl>
#1 A a 1
#2 A a 1
#3 A b 2
#4 A b 2
#5 A b 2
#6 B a 1
#7 B a 1
#8 B b 2
#9 B b 2
Another option is match() against the unique elements of 'Subgroup' after grouping by 'Group':
df %>%
group_by(Group) %>%
mutate(OUTPUT = match(Subgroup, unique(Subgroup)) )
# Group Subgroup OUTPUT
# <fctr> <fctr> <int>
#1 A a 1
#2 A a 1
#3 A b 2
#4 A b 2
#5 A b 2
#6 B a 1
#7 B a 1
#8 B b 2
#9 B b 2
library(data.table)
dt = as.data.table(df) # or setDT to convert in place
unique(dt[, .(Group, Subgroup)])[, idx := 1:.N, by = Group][dt, on = c('Group', 'Subgroup')]
# Group Subgroup idx OUTPUT
#1: A a 1 1
#2: A a 1 1
#3: A b 2 2
#4: A b 2 2
#5: A b 2 2
#6: B a 1 1
#7: B a 1 1
#8: B b 2 2
#9: B b 2 2
Translation to dplyr should be straightforward.
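As a sketch, that translation could use distinct() for the unique pairs, row_number() for the index, and a join back onto the original:
library(dplyr)
df %>%
  distinct(Group, Subgroup) %>%                # unique pairs, in first-appearance order
  group_by(Group) %>%
  mutate(idx = row_number()) %>%               # index of each subgroup within its group
  ungroup() %>%
  right_join(df, by = c("Group", "Subgroup"))  # attach idx back to every original row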
Another method, following the idea of using factors from aosmith's comment, is:
dt[, idx := as.integer(factor(Subgroup, unique(Subgroup))), by = Group][]
This will create a factor with correct levels per Group which is the indexing you're after.
I've got a dataframe consisting of a group and 2 value columns, as such:
group val1 val2
A 5 3
A 2 4
A 3 1
B 3 6
B 2 1
B 0 2
I want to work out the number of rows where val1 > val2, split by group. Initially I hardcoded this per group with:
number_a <- nrow(subset(df, group=="A" & val1 > val2))
number_b <- nrow(subset(df, group=="B" & val1 > val2))
What's the proper way of automating this? I tried using the split() function, but I couldn't work out how to pass in both the val1 and val2 columns.
Pretty straightforward using data.table.
If you want the number of rows
library(data.table)
setDT(df)[, .(RowsNum = sum(val1 > val2)), by = group]
# group RowsNum
# 1: A 2
# 2: B 1
If you're looking for split/apply combinations in base R, you could also try
sapply(split(df[-1], df[1]), function(x) sum(x[1] > x[2]))
# A B
# 2 1
Or using tapply (also from base R)
tapply(with(df, val1 > val2), df[1], sum)
# group
# A B
# 2 1
If you want the rows themselves
setDT(df)[, .SD[val1 > val2]]
# group val1 val2
# 1: A 5 3
# 2: A 3 1
# 3: B 2 1
Or very simply with base R too
df[with(df, val1 > val2), ]
# group val1 val2
# 1 A 5 3
# 3 A 3 1
# 5 B 2 1
Or
subset(df, val1 > val2)
# group val1 val2
# 1 A 5 3
# 3 A 3 1
# 5 B 2 1
Another option using dplyr
library(dplyr)
filter(df, val1 > val2)
# group val1 val2
#1 A 5 3
#2 A 3 1
#3 B 2 1
If you need the number of rows
df %>%
group_by(group) %>%
filter(val1 > val2) %>%
summarise(RowsNum=n())
# group RowsNum
#1 A 2
#2 B 1
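Note that filtering first silently drops any group with zero qualifying rows from the summary; summing the logical comparison directly keeps every group:
df %>%
  group_by(group) %>%
  summarise(RowsNum = sum(val1 > val2))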
Or using aggregate from base R
aggregate(cbind(RowsNum = val1 > val2) ~ group, df, sum)
# group RowsNum
#1 A 2
#2 B 1
You can try this (building the example data first and subsetting before aggregating):
data <- data.frame(group = c("A","A","A","B","B","B"),
                   val1 = c(5, 2, 3, 3, 2, 0),
                   val2 = c(3, 4, 1, 6, 1, 2))
aggregate(val1 ~ group, data[which(data$val1 > data$val2), ], length)
I'm having an issue converting the VBA code seen in this post to an R script.
The problem is as follows: I have a column (from a source database, not by choice) that contains multiple values for the attribute. I'd like to normalize this table while retaining the order in which each value occurs in each cell.
An example dataset:
dat <- data.frame(
  ID = 1:3,
  Multi = c("VAL1 VAL2 VAL3", "VAL2 VAL3", "VAL2 VAL3 VAL1"),
  stringsAsFactors = FALSE)
ID Multi
1 1 VAL1 VAL2 VAL3
2 2 VAL2 VAL3
3 3 VAL2 VAL3 VAL1
The pseudocode would be something like (a literal translation is sketched below):
Loop over each row
Split the Multi string with a space as the separator
For each split value, append a new row to a separate data.frame with the ID, the order within the full string, and the value
The result would look like:
ID Order Multi
1 1 1 VAL1
2 1 2 VAL2
3 1 3 VAL3
4 2 1 VAL2
5 2 2 VAL3
6 3 1 VAL2
7 3 2 VAL3
8 3 3 VAL1
I'm currently looking at doing this with a data.frame, but I'm thinking data.table would be more appropriate, as my table will have approximately 400,000 of these rows.
I apologize for not having any code ready; I'm still contemplating whether I need to use the apply family, data.table, or a simple for loop. I'll keep this post updated with any progress I make.
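For reference, a literal translation of that pseudocode; growing the result with rbind() inside the loop is what would make this slow at 400,000 rows:
out <- data.frame(ID = integer(0), Order = integer(0), Multi = character(0),
                  stringsAsFactors = FALSE)
for (i in seq_len(nrow(dat))) {
  parts <- strsplit(dat$Multi[i], " ", fixed = TRUE)[[1]]
  # append one row per split value, recording its position within the string
  out <- rbind(out, data.frame(ID = dat$ID[i], Order = seq_along(parts),
                               Multi = parts, stringsAsFactors = FALSE))
}
out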
Here are a couple of ways...
In base R:
X <- setNames(strsplit(as.character(dat$Multi), " "), dat$ID)
X1 <- stack(X)
X1$order <- ave(X1$ind, X1$ind, FUN = seq_along)
X1
# values ind order
# 1 VAL1 1 1
# 2 VAL2 1 2
# 3 VAL3 1 3
# 4 VAL2 2 1
# 5 VAL3 2 2
# 6 VAL2 3 1
# 7 VAL3 3 2
# 8 VAL1 3 3
OR (better):
X <- strsplit(as.character(dat[, "Multi"]), " ", fixed = TRUE)
len <- vapply(X, length, 1L)
data.frame(ID = rep(dat[, "ID"], len), order = sequence(len),
Multi = unlist(X, use.names=FALSE))
Using concat.split.multiple from my "splitstackshape" package (probably not too efficient on 400,000 rows though).
library(splitstackshape)
out <- concat.split.multiple(dat, "Multi", " ", "long")
out[order(out$ID, out$time), ]
# ID time Multi
# 1 1 1 VAL1
# 4 1 2 VAL2
# 7 1 3 VAL3
# 2 2 1 VAL2
# 5 2 2 VAL3
# 8 2 3 <NA>
# 3 3 1 VAL2
# 6 3 2 VAL3
# 9 3 3 VAL1
And, since you requested "data.table":
library(data.table)
DT <- data.table(dat)
DTL <- DT[, list(unlist(strsplit(as.character(Multi), " "))), by = ID]
DTL[, order := sequence(.N), by = ID]
DTL
# ID V1 order
# 1: 1 VAL1 1
# 2: 1 VAL2 2
# 3: 1 VAL3 3
# 4: 2 VAL2 1
# 5: 2 VAL3 2
# 6: 3 VAL2 1
# 7: 3 VAL3 2
# 8: 3 VAL1 3
Update: Some timings
I didn't bother testing my "splitstackshape" approach, because that uses read.table under the hood, so I know it won't stand up to the demands of performance.
However, base R still seems to win out!
Sample "big" data:
dat_big <- do.call(rbind, replicate(floor(4000/3), dat, simplify = FALSE))
dat_big <- do.call(rbind, replicate(100, dat_big, simplify = FALSE))
dat_big$ID <- make.unique(as.character(dat_big$ID))
DT <- data.table(dat)
DT_big <- data.table(dat_big)
Functions to test:
fun1 <- function(inDF) {
X <- strsplit(as.character(inDF[, "Multi"]), " ", fixed = TRUE)
len <- vapply(X, length, 1L)
data.frame(ID = rep(inDF[, "ID"], len), order = sequence(len),
Multi = unlist(X, use.names=FALSE))
}
fun2 <- function(inDT) {
DTL <- inDT[, list(unlist(strsplit(as.character(Multi), " ", fixed = TRUE))), by = ID]
DTL[, order := sequence(.N), by = ID]
DTL
}
The results for base R:
system.time(outDF <- fun1(dat_big))
# user system elapsed
# 6.418 0.000 6.454
dim(outDF)
# [1] 1066400 3
head(outDF)
# ID order Multi
# 1 1 1 VAL1
# 2 1 2 VAL2
# 3 1 3 VAL3
# 4 2 1 VAL2
# 5 2 2 VAL3
# 6 3 1 VAL2
tail(outDF)
# ID order Multi
# 1066395 1.133299 3 VAL3
# 1066396 2.133299 1 VAL2
# 1066397 2.133299 2 VAL3
# 1066398 3.133299 1 VAL2
# 1066399 3.133299 2 VAL3
# 1066400 3.133299 3 VAL1
The results for "data.table":
system.time(outDT <- fun2(DT_big))
# user system elapsed
# 14.035 0.000 14.057
dim(outDT)
# [1] 1066400 3
outDT
# ID V1 order
# 1: 1 VAL1 1
# 2: 1 VAL2 2
# 3: 1 VAL3 3
# 4: 2 VAL2 1
# 5: 2 VAL3 2
# ---
# 1066396: 2.133299 VAL2 1
# 1066397: 2.133299 VAL3 2
# 1066398: 3.133299 VAL2 1
# 1066399: 3.133299 VAL3 2
# 1066400: 3.133299 VAL1 3