I have a data table like this:
DT <- data.table(ID=rep(c(1:2),each=6), year=rep(c(2003:2006),each=3), month=rep(c(5:8),3), day=rep(c(11:14),3),value=c(101:112))
And I would like to add columns with the conditions:
1, add 5 columns with names: V100, V102, V105, V108, V112
2, in each column, grouped by ID and year, sum up the values less than the value in the column name, eg: for column V112, sum up grouped values less than 112
So the outcome will look like:
DT1 <- data.table(ID=rep(c(1:2),each=2), year=c(2003:2006), "100"=rep(0,4), "102"=c(2,0,0,0),"105"=c(3,2,0,0),"108"=c(3,3,2,0),"112"=rep(3,4))
I tried write codes but couldn't figure out:
degree <- c(100,102,105,108,112)
for (d in degree)
{
f_year <- function(d) {sum(DT$value <= d)}
DT <- DT[,d:=f_year(),by=list(ID,year)]
}
Any help would be appreciated!
Thats what lapply can be used for.
degree <- c(100, 102, 105, 108, 112)
myfun <- function(x,y) sum(y <= x)
DT1 <- DT[, lapply(degree, myfun, value), by = .(ID, year)]
setnames(DT1, c("ID", "year", as.character(degree)))
Result:
> DT1
ID year 100 102 105 108 112
1: 1 2003 0 2 3 3 3
2: 1 2004 0 0 2 3 3
3: 2 2005 0 0 0 2 3
4: 2 2006 0 0 0 0 3
Just another way:
cols = c(100,102,105,108,112)
DT[, lapply(cols, function(x) sum(value <= x)), by=.(ID, year)]
# ID year V1 V2 V3 V4 V5
# 1: 1 2003 0 2 3 3 3
# 2: 1 2004 0 0 2 3 3
# 3: 2 2005 0 0 0 2 3
# 4: 2 2006 0 0 0 0 3
Then you can set the names.
Instead if you'd like to set names directly, then you can create a named list first:
named_cols = setattr(as.list(cols), 'names', cols)
DT[, lapply(named_cols, function(x) sum(value<=x)), by=.(ID, year)]
# ID year 100 102 105 108 112
# 1: 1 2003 0 2 3 3 3
# 2: 1 2004 0 0 2 3 3
# 3: 2 2005 0 0 0 2 3
# 4: 2 2006 0 0 0 0 3
Related
I have a following data frame:
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
I would like to remove zeros at the end of var1 for each id if the number of repeating zeros
is larger than or equal to the number of subsequently repeating zeros anywhere within the data frame for a particular id.
example:
df[df$id=="a",]
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
13 a 0
14 a 0
15 a 0
16 a 0
17 a 0
18 a 0
19 a 0
20 a 0
For id "a", we have a series of 8 subsequent zeros at the end, which is the same length as a previous series of zeros, therefore, zeros at the end should be removed and new id "a" should look like this:
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
For id "b", we see there are only 2 subsequent zeros at the end, which is less than max number of subsequent zeros and therefore nothing should be done.
df[df$id=="b",]
id var1
21 b 0
22 b 1
23 b 0
24 b 0
25 b 0
26 b 0
27 b 0
28 b 0
29 b 0
30 b 0
31 b 0
32 b 1
33 b 1
34 b 1
35 b 0
36 b 1
37 b 0
38 b 1
39 b 0
40 b 0
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
library(data.table)
library(magrittr)
setDT(df)
to_remove <-
# get all run lengths of 0s
df[, .N, .(id, var1, rleid(var1))][var1 == 0] %>%
# only for ids with trailing 0s
.[df[, if (last(var1) == 0) id, id], on = .(id)] %>%
# only if the last is longer than all previous for that id
.[, if (last(N) >= max(N[-.N])) .(n_rem = last(N)), id] %>%
.[, setNames(n_rem, id)]
to_remove
#> a h
#> 8 8
df[, head(.SD, .N - fcoalesce(to_remove[id], 0L))
, by = id]
#> id var1
#> 1: a 0
#> 2: a 1
#> 3: a 1
#> 4: a 0
#> 5: a 0
#> ---
#> 180: j 0
#> 181: j 0
#> 182: j 1
#> 183: j 0
#> 184: j 0
Created on 2021-11-24 by the reprex package (v2.0.1)
In a single chain:
df[, .N, .(id, var1, rleid(var1))][var1 == 0] %>%
.[df[, if (last(var1) == 0) id, id], on = .(id)] %>%
.[, if (last(N) >= max(N[-.N])) .(n_rem = last(N)), id] %>%
.[, setNames(n_rem, id)] %>%
{df[, head(.SD, .N - fcoalesce(.[id], 0L)), id]}
# id var1
# 1: a 0
# 2: a 1
# 3: a 1
# 4: a 0
# 5: a 0
# ---
# 180: j 0
# 181: j 0
# 182: j 1
# 183: j 0
# 184: j 0
We create a function with rle - with two parameters 'x' and the threshold ('thresh'), apply the rle (run-length-encoding) on the input 'x' (rle - returns a list output with lengths and values as two vectors). Check whether the last element of values is 0 and its corresponding lengths is greater than or equal to the threshold passed, then replace the last element of logical TRUE vector ('tmp1') to FALSE, and return the replicated 'tmp1'.
Do a group by 'id' in ave, apply the function and subset the rows
f1 <- function(x, thresh) {
with(rle(x), {
tmp1 <- rep(TRUE, length(values))
tmp2 <- values[length(values)]
tmp1[length(tmp1)][tmp2 == 0 & lengths[length(values)] >= thresh] <- FALSE
rep(tmp1, lengths)
})
}
-testing
out <- subset(df, as.logical(ave(var1, id, FUN = function(x) f1(x, 8))))
-output
> subset(out, id == 'a')
id var1
1 a 0
2 a 1
3 a 1
4 a 0
5 a 0
6 a 0
7 a 0
8 a 0
9 a 0
10 a 0
11 a 0
12 a 1
> subset(out, id == 'b')
id var1
21 b 0
22 b 1
23 b 0
24 b 0
25 b 0
26 b 0
27 b 0
28 b 0
29 b 0
30 b 0
31 b 0
32 b 1
33 b 1
34 b 1
35 b 0
36 b 1
37 b 0
38 b 1
39 b 0
40 b 0
Version that is a single pipeline for no particular reason:
set.seed(12)
id<-rep(letters[1:10],each=20)
var1<-rbinom(200,1,0.25)
df<-data.frame(id, var1)
df %>%
group_by(id) %>%
tidyr::nest() %>%
dplyr::mutate(
data = purrr::map(data, ~rle(.x$var1)),
max = purrr::map_int(data, ~max(.x$lengths[.x$values == 0])),
last = purrr::map_int(data, ~{
.x$lengths[.x$values == 0][length(.x$lengths[.x$values == 0])]
})
) %>%
dplyr::mutate(
data = purrr::map(
data, ~{
if(max > last) {
x <- inverse.rle(.x)
len <- length(x)
x[(len - last):len] <- NA
x
} else {
inverse.rle(.x)
}
}
)
) %>%
dplyr::select(id, data) %>%
tidyr::unnest(c(id, data)) %>%
tidyr::drop_na()
I have a df called laws with a row for each law (one for each id):
laws <- data.frame(id=c(1,2,3),beginyear=c(2001,2002,2005),endyear=c(2003,2005,2006), law1=c(0,0,1), law2=c(1,0,1))
from which I want to create second called idyear with a row for each id and year:
idyear <- data.frame(id=c(rep(1,6),rep(2,6),rep(3,6)), year=(rep(c(2001:2006),3)), law1=c(rep(0,16),1,1), law2=c(1,1,1,rep(0,13),1,1))
How would I efficiently go about writing some code to get the idyear df output from the laws df? The two law variables are indicator variables == 1 if the idyear$year is >= laws$beginyear AND idyear$year is <= laws$endyear.
I am a beginner with R, but I'm willing to try anything (apply, loops, etc.) to get this to work.
1) base expand.grid will create an 18 x 2 data frame of all id and year combinations and then merge will merge it back together with laws. Zero out any law1 and law2 entry for which year is not between beginyear and endyear. Finally drop the beginyear and endyear columns. No packages are used.
g <- with(laws, expand.grid(year = min(beginyear):max(endyear), id = id))
m <- merge(g, laws)
m[m$year < m$beginyear | m$year > m$endyear, c("law1", "law2")] <- 0
m <- subset(m, select = - c(beginyear, endyear))
# check
identical(m, idyear)
## [1] TRUE
2) magrittr This is the same solution as (1) except we have used magrittr pipelines to express it. Note the mixture of pipe operators.
library(magrittr)
laws %$%
expand.grid(year = min(beginyear):max(endyear), id = id) %>%
merge(laws) %$%
{ .[year < beginyear | year > endyear, c("law1", "law2")] <- 0; .} %>%
subset(select = - c(beginyear, endyear))
Update: Fixed. Added (2).
A solution using tidyverse. The last as.data.frame() is optional, which just convert the tbl to a data frame.
library(tidyverse)
idyear <- laws %>%
mutate(year = map2(beginyear, endyear, `:`)) %>%
unnest() %>%
complete(id, year = full_seq(year, period = 1L), fill = list(law1 = 0L, law2 = 0L)) %>%
select(-beginyear, -endyear) %>%
as.data.frame()
idyear
# id year law1 law2
# 1 1 2001 0 1
# 2 1 2002 0 1
# 3 1 2003 0 1
# 4 1 2004 0 0
# 5 1 2005 0 0
# 6 1 2006 0 0
# 7 2 2001 0 0
# 8 2 2002 0 0
# 9 2 2003 0 0
# 10 2 2004 0 0
# 11 2 2005 0 0
# 12 2 2006 0 0
# 13 3 2001 0 0
# 14 3 2002 0 0
# 15 3 2003 0 0
# 16 3 2004 0 0
# 17 3 2005 1 1
# 18 3 2006 1 1
Use of mapply function can help.
# Function to expand year between begin and end
gen_data <- function(x_id, x_beginyear, x_endyear, x_law1, x_law2){
df <- data.frame(x_id, x_beginyear:x_endyear, x_law1, x_law2)
df
}
idyearlst <- data.frame()
idyearlst <- rbind(idyearlst, mapply(gen_data, laws$id, laws$beginyear,
laws$endyear, laws$law1, laws$law2))
# Finally convert list to data.frame
idyear <- setNames(do.call(rbind.data.frame, idyearlst), c("id", "year", "law1", "law2"))
Result will be like:
> idyear
id year law1 law2
V1.1 1 2001 0 1
V1.2 1 2002 0 1
V1.3 1 2003 0 1
V2.4 2 2002 0 0
V2.5 2 2003 0 0
V2.6 2 2004 0 0
V2.7 2 2005 0 0
V3.8 3 2005 1 1
V3.9 3 2006 1 1
Kind of an ugly approach, but I think it gets what you're after, using G. Grothendieck's g expand.grid data frame as a base, and your laws dataframe.
new.df <- data.frame(t(apply(g, 1, function(x){
yearspan = laws[laws$id == x['id'], 'beginyear']:laws[laws$id == x['id'], 'endyear']
law1 = laws$law1[laws$id == x['id'] & x['year'] %in% yearspan]
law2 = laws$law2[laws$id == x['id'] & x['year'] %in% yearspan]
x['law1'] = ifelse(length(law1 > 0), law1, 0)
x['law2'] = ifelse(length(law2 > 0), law2, 0)
return(x)
})))
> new.df
id year law1 law2
1 1 2001 0 1
2 1 2002 0 1
3 1 2003 0 1
4 1 2004 0 0
5 1 2005 0 0
6 1 2006 0 0
7 2 2001 0 0
8 2 2002 0 0
9 2 2003 0 0
10 2 2004 0 0
11 2 2005 0 0
12 2 2006 0 0
13 3 2001 0 0
14 3 2002 0 0
15 3 2003 0 0
16 3 2004 0 0
17 3 2005 1 1
18 3 2006 1 1
Libraries:
dplyr (for arrange, not really necessary)
Data:
laws <- data.frame(id=c(1,2,3),
beginyear=c(2001,2002,2005),
endyear=c(2003,2005,2006),
law1=c(0,0,1), law2=c(1,0,1))
g <- with(laws, expand.grid(id = id, year = min(beginyear):max(endyear)))
g <- arrange(g, id)
If there is a sample data set as below.
> tmp <- data.table(x=c(1:10),y=(5:14))
> tmp
x y
1: 1 5
2: 2 6
3: 3 7
4: 4 8
5: 5 9
6: 6 10
7: 7 11
8: 8 12
9: 9 13
10: 10 14
I want choose two lowest number and I want change 0 value to the other numbers.
like
x y
1: 1 5
2: 2 6
3: 0 0
4: 0 0
5: 0 0
6: 0 0
7: 0 0
8: 0 0
9: 0 0
10: 0 0
I think the coding is
tmp[, c("x","y"):=lapply(.SD, x[which(!x %in% sort(x)[1:2])] = 0}), .SDcols=c("x","y")]
but it changes all 0
How can i solve this problem.
To expand on my comment, I'd do something like this:
for (j in names(tmp)) {
col = tmp[[j]]
min_2 = sort.int(unique(col), partial=2L)[2L] # 2nd lowest value
set(tmp, i = which(col > min_2), j = j, value = 0L)
}
This loops over all the columns in tmp, and gets the 2nd minimum value for each column using sort.int with partial argument, which is slightly more efficient than using sort (as we don't have to sort the entire data set to find the 2nd minimum value).
Then we use set() to replace those rows where the column value is greater than the 2nd minimum value, for that column, with the value 0.
May be you can try
tmp[, lapply(.SD, function(x) replace(x,
!rank(x, ties.method='first') %in% 1:2, 0))]
# x y
#1: 1 5
#2: 2 6
#3: 0 0
#4: 0 0
#5: 0 0
#6: 0 0
#7: 0 0
#8: 0 0
#9: 0 0
#10:0 0
I would like to create a panel from a dataset that has one observation for every given time period such that every unit has a new observation for every time period. Using the following example:
id <- seq(1:4)
year <- c(2005, 2008, 2008, 2007)
y <- c(1,0,0,1)
frame <- data.frame(id, year, y)
frame
id year y
1 1 2005 1
2 2 2008 0
3 3 2008 0
4 4 2007 1
For each unique ID, I would like there to be a unique observation for the year 2005, 2006, 2007, and 2008 (the lower and upper time periods on this frame), and set the outcome y to 0 for all the times in which there isn't an existing observation, such that the new frame looks like:
id year y
1 1 2005 1
2 1 2006 0
3 1 2007 0
4 1 2008 0
....
13 4 2005 0
14 4 2006 0
15 4 2007 1
16 4 2008 0
I haven't had much success with loops; Any and all thoughts would be greatly appreciated.
1) reshape2 Create a grid g of all years and id values crossed and rbind it with frame.
Then using the reshape2 package cast frame from long to wide form and then melt it back to long form. Finally rearrange the rows and columns as desired.
The lines ending in one # are only to ensure that every year is present so if we knew that were the case those lines could be omitted. The line ending in ## is only to rearrange the rows and columns so if that did not matter that line could be omitted too.
library(reshape2)
g <- with(frame, expand.grid(year = seq(min(year), max(year)), id = unique(id), y = 0)) #
frame <- rbind(frame, g) #
wide <- dcast(frame, year ~ id, fill = 0, fun = sum, value.var = "y")
long <- melt(wide, id = "year", variable.name = "id", value.name = "y")
long <- long[order(long$id, long$year), c("id", "year", "y")] ##
giving:
> long
id year y
1 1 2005 1
2 1 2006 0
3 1 2007 0
4 1 2008 0
5 2 2005 0
6 2 2006 0
7 2 2007 0
8 2 2008 0
9 3 2005 0
10 3 2006 0
11 3 2007 0
12 3 2008 0
13 4 2005 0
14 4 2006 0
15 4 2007 1
16 4 2008 0
2) aggregate A shorter solution would be to run just the two lines that end with # above and then follow those with an aggregate as shown. This solution uses no addon packages.
g <- with(frame, expand.grid(year = seq(min(year), max(year)), id = unique(id), y = 0)) #
frame <- rbind(frame, g) #
aggregate(y ~ year + id, frame, sum)[c("id", "year", "y")]
This gives the same answer as solution (1) except as noted by a commenter solution (1) above makes id a factor whereas it is not in this solution.
Using data.table:
require(data.table)
DT <- data.table(frame, key=c("id", "year"))
comb <- CJ(1:4, 2005:2008) # like 'expand.grid', but faster + sets key
ans <- DT[comb][is.na(y), y:=0L] # perform a join (DT[comb]), then set NAs to 0
# id year y
# 1: 1 2005 1
# 2: 1 2006 0
# 3: 1 2007 0
# 4: 1 2008 0
# 5: 2 2005 0
# 6: 2 2006 0
# 7: 2 2007 0
# 8: 2 2008 0
# 9: 3 2005 0
# 10: 3 2006 0
# 11: 3 2007 0
# 12: 3 2008 0
# 13: 4 2005 0
# 14: 4 2006 0
# 15: 4 2007 1
# 16: 4 2008 0
maybe not an elegant solution, but anyway:
df <- expand.grid(id=id, year=unique(year))
frame <- frame[frame$y != 0,]
df$y <- 0
df2 <- rbind(frame, df)
df2 <- df2[!duplicated(df2[,c("id", "year")]),]
df2 <- df2[order(df2$id, df2$year),]
rownames(df2) <- NULL
df2
# id year y
# 1 1 2005 1
# 2 1 2006 0
# 3 1 2007 0
# 4 1 2008 0
# 5 2 2005 0
# 6 2 2006 0
# 7 2 2007 0
# 8 2 2008 0
# 9 3 2005 0
# 10 3 2006 0
# 11 3 2007 0
# 12 3 2008 0
# 13 4 2005 0
# 14 4 2006 0
# 15 4 2007 1
# 16 4 2008 0
This question already has answers here:
Dummy variables from a string variable
(7 answers)
Closed 9 years ago.
I have read this file into a data.frame in R, and as you can see the 5th column contains some values separated with ";". Is it possible to turn this data.frame to a much larger data.frame and expand the 5th column into a binary vector?
> head(uinfo)
V1 V2 V3 V4 V5
1 100044 1899 1 5 831;55;198;8;450;7;39;5;111
2 100054 1987 2 6 0
3 100065 1989 1 57 0
4 100080 1986 1 31 113;41;44;48;91;96;42;79;92;35
5 100086 1986 1 129 0
6 100097 1981 1 75 0
So, as a simpler example, if my first two rows are:
1 100044 1899 1 5 1;2;4;7
2 100054 1987 2 6 3;8
I want to get:
1 100044 1899 1 5 1 1 0 1 0 0 1 0 0 0
2 100054 1987 2 6 0 0 1 0 0 0 0 1 0 0
Do I have to use another program such as python for preprocessing of the data, or is it possible to do so by some apply function?
Thanks
You can try the concat.split.expanded function from my "splitstackshape" package:
library(splitstackshape)
mydf
# V1 V2 V3 V4 V5
# 1 100044 1899 1 5 1;2;4;7
# 2 100054 1987 2 6 3;8
concat.split.expanded(mydf, "V5", sep=";", fill = 0)
# V1 V2 V3 V4 V5 V5_1 V5_2 V5_3 V5_4 V5_5 V5_6 V5_7 V5_8
# 1 100044 1899 1 5 1;2;4;7 1 1 0 1 0 0 1 0
# 2 100054 1987 2 6 3;8 0 0 1 0 0 0 0 1
Add drop = TRUE to get rid of the original column.
Here, "mydf" is defined as:
mydf <- structure(list(V1 = c(100044L, 100054L), V2 = c(1899L, 1987L),
V3 = 1:2, V4 = 5:6, V5 = c("1;2;4;7", "3;8")), .Names = c("V1",
"V2", "V3", "V4", "V5"), class = "data.frame", row.names = c(NA, -2L))
Using base functions (too many steps I think)
> df <- read.table(text=" 100044 1899 1 5 1;2;4;7
+ 100054 1987 2 6 3;8", header=F, stringsAsFactors=F) # data.frame
> pos <- sapply(strsplit(as.character(df[,5]), ";"), as.numeric)
> x <-rep(0, max(unlist(pos)))
> cbind(df, t(sapply(pos, function(y) replace(x, y, 1))))
V1 V2 V3 V4 V5 1 2 3 4 5 6 7 8
1 100044 1899 1 5 1;2;4;7 1 1 0 1 0 0 1 0
2 100054 1987 2 6 3;8 0 0 1 0 0 0 0 1
The idea here is that there are two requirements:
split the data on the semicolon`
create the columns, filling empty cols with zero/FALSE
# 1 is straightforward: use strsplit.
# 2 can be accomplished by sequencing along the number of new columns, and checking if they're in the newly splatted string.
library(data.table)
largest <- 1e3 # (Whatever your largest expected value)
newColNames <- as.character(seq(largest))
dat[, (newColNames) := as.data.table(t(sapply(strsplit(V5, ";"), "%in%", x=seq(largest))))]
# if you really want numeric (as opposed to logical)
dat[, (newColNames) := lapply(.SD, as.numeric), .SDcols=newColNames]