Wide to long with many different columns - r
I have used pivot_longer before but this time I have a much more complex wide dataframe and I cannot sort it out. The example code will provide you a reproducible dataframe. I haven't dealt with such thing before so I'm not sure it's correct to try to format this type of df in long format?
df <- data.frame(
ID = as.numeric(c("7","8","10","11","13","15","16")),
AGE = as.character(c("45 – 54","25 – 34","25 – 34","25 – 34","25 – 34","18 – 24","35 – 44")),
GENDER = as.character(c("Female","Female","Male","Female","Other","Male","Female")),
SD = as.numeric(c("3","0","0","0","3","2","0")),
GAMING = as.numeric(c("0","0","0","0","2","2","0")),
HW = as.numeric(c("2","2","0","2","2","2","2")),
R1_1 = as.numeric(c("10","34","69","53","79","55","28")),
M1_1 = as.numeric(c("65","32","64","53","87","55","27")),
P1_1 = as.numeric(c("65","38","67","54","88","44","26")),
R1_2 = as.numeric(c("15","57","37","54","75","91","37")),
M1_2 = as.numeric(c("90","26","42","56","74","90","37")),
P1_2 = as.numeric(c("90","44","33","54","79","95","37")),
R1_3 = as.numeric(c("5","47","80","27","61","19","57")),
M1_3 = as.numeric(c("30","71","80","34","71","15","57")),
P1_3 = as.numeric(c("30","36","81","35","62","8","56")),
R2_1 = as.numeric(c("10","39","75","31","71","80","59")),
M2_1 = as.numeric(c("90","51","74","15","70","75","61")),
P2_1 = as.numeric(c("90","52","35","34","69","83","60")),
R2_2 = as.numeric(c("10","45","31","54","39","95","77")),
M2_2 = as.numeric(c("60","70","40","78","5","97","75")),
P2_2 = as.numeric(c("60","40","41","58","9","97","76")),
R2_3 = as.numeric(c("5","38","78","45","25","16","22")),
M2_3 = as.numeric(c("30","34","84","62","33","52","20")),
P2_3 = as.numeric(c("30","34","82","45","32","16","22")),
R3_1 = as.numeric(c("10","40","41","42","62","89","41")),
M3_1 = as.numeric(c("90","67","37","40","27","89","42")),
P3_1 = as.numeric(c("90","34","51","44","38","84","43")),
R3_2 = as.numeric(c("10","37","20","54","8","93","69")),
M3_2 = as.numeric(c("60","38","21","62","5","95","71")),
P3_2 = as.numeric(c("60","38","23","65","14","92","69")),
R3_3 = as.numeric(c("5","30","62","11","60","32","52")),
M3_3 = as.numeric(c("30","67","34","55","45","25","45")),
P3_3 = as.numeric(c("30","28","41","24","53","23","52")),
R1_4 = as.numeric(c("10","40","61","17","39","72","25")),
M1_4 = as.numeric(c("45","20","63","25","62","70","23")),
P1_4 = as.numeric(c("45","52","56","16","26","72","27")),
R2_4 = as.numeric(c("5","21","70","33","80","68","30")),
M2_4 = as.numeric(c("35","21","69","27","85","69","23")),
P2_4 = as.numeric(c("35","32","34","25","79","63","29")),
R3_4 = as.numeric(c("10","29","68","21","8","71","41")),
M3_4 = as.numeric(c("50","37","66","28","33","65","41")),
P3_4 = as.numeric(c("50","38","47","28","24","71","41"))
)
I would like to sort it out like in the following table
the new column names are extracted from the old ones such that (example) in R1_1:
R is the namer of the column containing the value previously stored
in R1_1
1 (the first character after 'R' in R1_1) is the value used
in column Speed
1 (last character of 'R1_1') is the value used in
column Sound
basically each row corresponds to 1 question answered by 1 person, and each question was answered through 3 different ratings (R, M, P)
thank you!
If I understood you correctly, the following should work:
df %>%
pivot_longer(
cols = matches('[RMP]\\d_\\d'),
names_to = c('RMP', 'Speed', 'Sound'),
values_to = 'Data',
names_pattern = '([RMP])(\\d)_(\\d)'
) %>%
pivot_wider(names_from = RMP, values_from = Data)
This assumes that both “speed” and “sound” are single-digit values. If there’s the possibility of multiple digits, the occurrences of \\d in the patterns above need to be replaced by \\d+.
Solution using our good ol' workhorse reshape. At first we grep the names with a "Wd_d" pattern, as well as their suffixes "d_d" for following use in reshape.
nm <- names(df[grep("_\\d", names(df))])
times <- unique(substr(nm, 2, 4))
res <- reshape(df, idvar="ID", varying=7:42, v.names=unique(substr(nm, 1, 1)),
times=times,direction="long")
Getting us close to the result, we just need to strsplit the newly created "time" variable at the "_" and rbind it to the former.
res <- cbind(res, setNames(type.convert(do.call(rbind.data.frame,
strsplit(res$time, "_"))),
c("Speed", "Sound")))
res <- res[order(res$AGE), ] ## some ordering
Result
head(res)
# ID AGE GENDER SD GAMING HW time R M P Speed Sound
# 15.1_1 15 18 – 24 Male 2 2 2 1_1 55 44 55 1 1
# 15.1_2 15 18 – 24 Male 2 2 2 1_2 90 95 91 1 2
# 15.1_3 15 18 – 24 Male 2 2 2 1_3 15 8 19 1 3
# 15.2_1 15 18 – 24 Male 2 2 2 2_1 75 83 80 2 1
# 15.2_2 15 18 – 24 Male 2 2 2 2_2 97 97 95 2 2
# 15.2_3 15 18 – 24 Male 2 2 2 2_3 52 16 16 2 3
Related
R How to count the number of date events in a date range contingent on an ID code
I'm trying to figure out how I can make R count the number of df$event_dt by each individual df$id in the episode.start - episode.end range for that id in All_date_events that AdhereR produces. Essentially I want to find the sum of events for that given ID in the date range. After running this code you'll have 2 data frames. df, transformed from original df with 4 columns is the input data All_date_events is the output I'd like to add a event.num column to. library(dplyr) library(tidyverse) library(AdhereR) library(hablar) library(lubridate) # Starting data df <- data.frame(id = c("1","1","2","2","1","1","1"), event_dt = dmy('2-1-2012', '2-4-2012', '2-5-2012', '2-7-2012', '2-12-2012', '1-3-2013', '22-5-2013')) df <- df %>% mutate(ones = "1", class = "N1") df <- transform(df,ones = as.numeric(ones))) All_date_events <- compute.treatment.episodes(df, ID.colname="id", event.date.colname="event_dt", event.duration.colname="ones", event.daily.dose.colname="ones", medication.class.colname="class", carryover.within.obs.window = TRUE, # carry-over into the OW carry.only.for.same.medication = TRUE, # & only for same type consider.dosage.change = TRUE, # dosage change starts new episode... medication.change.means.new.treatment.episode = TRUE, # & type change maximum.permissible.gap = 90, # & a gap longer than 180 days maximum.permissible.gap.unit = "days", # unit for the above (days) followup.window.start = 0, # 2-years FUW starts at earliest event followup.window.start.unit = "days", followup.window.duration = 365*2, followup.window.duration.unit = "days", date.format = "%d/%m/%Y"); Either of these results is what I'm hoping for id episode.ID episode.start end.episode.gap.days episode.duration episode.end event.num 1 1 2012-01-02 243 92 2012-04-03 2 1 2 2012-12-02 223 172 2013-05-23 3 2 1 2012-05-02 668 62 2012-07-03 2 Id.code Event.date Event.start Event.end Events.num Event.Episode 1 2.1.2012 1 2.4.2012 2.1.2012 2.4.2012 2 1 2 2.5.2012 2 2.7.2012 2.5.2012 2.7.2012 2 1 1 2.12.2012 1 21.3.2013 1 22.5.2013 2.5.2012 22.5.2013 3 2
R sampling with if statement and similar number of sample
I need to to create a sample from my dataframe and to do so I am using the code bellow. name <- sample(c("Adam","John","Henry","Mike"),100,rep = TRUE) area <- sample(c("run","develop","test"),100,rep = TRUE) id <- sample(100:200,100,rep = FALSE) mydata <- as.data.frame(cbind(id,area,name)) qcsample <- mydata %>% group_by(area) %>% nest() %>% mutate(n = c(20, 15, 15)) %>% mutate(samp = map2(data, n, sample_n)) %>% select(area, samp) %>% unnest() Now, I am getting these results. table(qcsample$area) develop run test 15 15 20 -- table(qcsample$name) Adam Henry John Mike 9 9 16 16 I would like to create a sample that would have more or less the same number of samples for each name eg. Adam - 12, Henry - 12, John - 13, Mike - 13. How can I achieve that ? can I somehow request that the sample is equally distributed ? Also, in this example I used function sample_n and specified number of samples. I am anticipating that sometimes there will not be required number from a given group. In my example I am taking 20 samples from area called "test" but sometimes there will be only let's say 10 rows containing "test". The total number is 50 so I need to make sure if there are only 10 "test" the code has to automatically increase the others, so the sample would be "test" - 10, "run" - 20 and "develop" - 20. This can happen to any of the area so I need to test if there is enough rows to create the sample and increase other areas. If there is only 1 it can be added to any of the remaining areas or if the difference is 3 we add 1 to one area and 2 to the another one. How could I check that taking into account all the possibilities ? I believe there are eight permutations in this case. Thanks in advance A.
If you are using made up data then you can create a minimum amount of each row and then create filler to get you up to the total: set.seed(42) names <- c("Adam", "John", "Henry", "Mike") areas <- c("run", "develop", "test") totalrows <- 100 minname <- 22 # No less than 20 of each name (set to near threshold to test) minarea <- 30 # No less than 30 of each area (less randomness the higher these are) qcsample <- data.frame( name=sample(c(rep(names, minname), sample(names, totalrows-length(names)*minname, replace=T))), area=sample(c(rep(areas, minarea), sample(areas, totalrows-length(areas)*minarea, replace=T))), id=sample(99+(1:totalrows)) ) This results in: R> table(qcsample$name) Adam Henry John Mike 23 28 24 25 R> table(qcsample$area) develop run test 37 31 32 Notice that the count of name to area isn't constrained: R> table(qcsample[,-3]) area name develop run test Adam 5 11 7 Henry 11 8 9 John 10 7 7 Mike 11 5 9 R> Using a loop as suggested by #r2evans: library(dplyr) set.seed(42) mydata <- data.frame( name = sample(c("Adam","John","Henry","Mike"), 100, rep = TRUE), area = sample(c("run","develop","test"), 100, rep = TRUE), id = sample(100:200, 100, rep = FALSE) ) Nsamples <- 50 mysample <- data.frame(sample_n(mydata, Nsamples)) minname <- 11 # max is 50/4 -> 12 minarea <- 15 # max is 50/3 -> 16 # the test you were asking about while( (min(table(mysample$name)) < minname) || (min(table(mysample$area)) < minarea) ) { mysample <- data.frame(sample_n(mydata, Nsamples)) } This results in: R> table(mysample$name) Adam Henry John Mike 13 15 11 11 R> table(mysample$area) develop run test 15 17 18 And, like before, there's no minimum of name to area. R> table(mysample[-3]) area name develop run test Adam 4 3 6 Henry 2 6 7 John 4 4 3 Mike 5 4 2 If you needed to enforce an minimum number for each permutation add this to the test: while(... || (min(table(mysample[-3])) < some_min)) { BTW, the number of permutations, as you can see by the table, is the number of names times the number of areas.
Here's another thought. Depending on your desired end-size, it might over-create the number of samples so that it can reduce some name/area pairs to bring the total down. Let's say that you want to end up with a total of 50 rows: final_size <- 50 For completeness, here are the sets from which we'll choose: avail_names <- c("Adam", "John", "Henry", "Mike") avail_areas <- c("run", "develop", "test") and the minimum we need to create for Adam,run (etc) in order to certainly end up with no less than final_size rows: size_per_namearea <- ceiling(final_size / (length(avail_names) * length(avail_areas))) Ok, generate at least as many (likely more than) the number of rows we need: set.seed(20180920) qcsample <- crossing(data_frame(rownum = seq_len(size_per_namearea)), data_frame(name = avail_names), data_frame(area = avail_areas)) %>% group_by(name, area) %>% mutate(id = sample(100, size = n(), replace = FALSE)) qcsample # # A tibble: 60 x 4 # # Groups: name, area [12] # rownum name area id # <int> <chr> <chr> <int> # 1 1 Adam run 59 # 2 1 Adam develop 51 # 3 1 Adam test 23 # 4 1 John run 71 # 5 1 John develop 5 # 6 1 John test 24 # 7 1 Henry run 4 # 8 1 Henry develop 29 # 9 1 Henry test 79 # 10 1 Mike run 77 # # ... with 50 more rows Verify we have identical sample sizes for each name/area: xtabs(~ name + area, data = qcsample) %>% stats::addmargins() # area # name develop run test Sum # Adam 5 5 5 15 # Henry 5 5 5 15 # John 5 5 5 15 # Mike 5 5 5 15 # Sum 20 20 20 60 If we just do head(final_size), then we know which names we will be cutting short, which undermines the randomness of your sampling a little. The reason I added rownum up front was so that I can arrange by it plus a jitter, ensuring I get all of max(rownum)-1, and then some sampling of max(rownum), guaranteeing that each name/area pair have either max(rownum)-1 or max(rownum) rows; your tallies are never different by more than 1. reducedsample <- arrange(qcsample, rownum + runif(n())) %>% head(final_size) %>% select(-rownum) reducedsample %>% xtabs(~ name + area, data = .) %>% stats::addmargins() # area # name develop run test Sum # Adam 4 4 5 13 # Henry 5 4 4 13 # John 4 4 4 12 # Mike 4 4 4 12 # Sum 17 16 17 50
R: Splitting dataset by pre-determined values
I have data that looks like this (but larger): Pos Value 0 66.81967 1 66.36885 2 65.79508 3 65.27049 4 64.88525 5 64.97541 6 65.39344 7 65.99181 8 66.63115 9 66.95901 10 66.89344 11 66.44262 12 65.90984 13 65.49181 14 65.35246 I have already determined the maxima and saved the position values of each to a vector like so: 9 19 30 42 56 69 80 92 107 118 130 143 154 164 176 188 199 211 222 234 245 I now want to split the data based on the value of the maxima, so for the sample data I'd want to split the dataset into the values for Positions 0->9 and into the values for Positions 10-15, and save each of these sub-sets into vectors of their own. I'm new to R (and coding) and was wondering how to best go about this.
Suppose your data frame is dat and your maxima values are in a vector maxima, you might use split(dat, cut(dat$Pos, breaks = maxima, include.lowest = TRUE)) For your example data frame: dat <- structure(list(Pos = 0:14, Value = c(66.81967, 66.36885, 65.79508, 65.27049, 64.88525, 64.97541, 65.39344, 65.99181, 66.63115, 66.95901, 66.89344, 66.44262, 65.90984, 65.49181, 65.35246)), .Names = c("Pos", "Value"), class = "data.frame", row.names = c(NA, -15L)) and the first few values of your maxima in the range: maxima <- c(0, 10, 19) my code gives you a list of data frames #$`[0,10]` # Pos Value #1 0 66.81967 #2 1 66.36885 #3 2 65.79508 #4 3 65.27049 #5 4 64.88525 #6 5 64.97541 #7 6 65.39344 #8 7 65.99181 #9 8 66.63115 #10 9 66.95901 #11 10 66.89344 # #$`(10,19]` # Pos Value #12 11 66.44262 #13 12 65.90984 #14 13 65.49181 #15 14 65.35246 If you don't want data frames, but just Value, use split(dat$Value, cut(dat$Pos, breaks = maxima, include.lowest = TRUE)) #$`[0,10]` # [1] 66.81967 66.36885 65.79508 65.27049 64.88525 64.97541 65.39344 65.99181 # [9] 66.63115 66.95901 66.89344 # #$`(10,19]` # [1] 66.44262 65.90984 65.49181 65.35246 Thanks! How would I go about saving these as separate data frames/sets (not sure on the correct terminology) so that I can then fit them individually? How about lst <- split(dat, cut(dat$Pos, breaks = maxima, include.lowest = TRUE)) dir <- getwd() lapply(seq_len(length(lst)), function (i) write.csv(lst[[i]], file = paste0(dir,"/",names(lst[i]), ".csv"), row.names = FALSE)) This will save each data frame into a .csv file under directory dir. I have used getwd() to test the code; you may change it to a specific folder.
Not sure if that is the best approach, but I would work with a list and use a for loop like this (untested): maxpos <- c(9, 19, 30) ans <- list() prev <- 1 for (i in seq.int(length(maxpos))) { ans[[i]] <- dataset[seq(prev, maxpos[i]),] prev <- maxpos[i+1] } ans[[length(maxpos)+1]] <- dataset[seq(maxpos[length[maxpos]]+1,nrow(dataset)),]
R - Create a new variable where each observation depends on another table and other variables in the data frame
I have the two following tables: df <- data.frame(eth = c("A","B","B","A","C"),ZIP1 = c(1,1,2,3,5)) Inc <- data.frame(ZIP2 = c(1,2,3,4,5,6,7),A = c(56,98,43,4,90,19,59), B = c(49,10,69,30,10,4,95),C = c(69,2,59,8,17,84,30)) eth ZIP1 ZIP2 A B C A 1 1 56 49 69 B 1 2 98 10 2 B 2 3 43 69 59 A 3 4 4 30 8 C 5 5 90 10 17 6 19 4 84 7 59 95 39 I would like to create a variable Inc in the df data frame where for each observation, the value is the intersection of the eth and ZIP of the observation. In my example, it would lead to: eth ZIP1 Inc A 1 56 B 1 49 B 2 10 A 3 43 C 5 17 A loop or quite brute force could solve it but it takes time on my dataset, I'm looking for a more subtle way maybe using data.table. It seems to me that it is a very standard question and I'm apologizing if it is, my unability to formulate a precise title for this problem (as you may have noticed..) is maybe why I haven't found any similar question in searching on the forum.. Thanks !
Sure, it can be done in data.table: library(data.table) setDT(df) df[ melt(Inc, id.var="ZIP2", variable.name="eth", value.name="Inc"), Inc := i.Inc , on=c(ZIP1 = "ZIP2","eth") ] The syntax for this "merge-assign" operation is X[i, Xcol := expression, on=merge_cols]. You can run the i = melt(Inc, id.var="ZIP", variable.name="eth", value.name="Inc") part on its own to see how it works. Inside the merge, columns from i can be referred to with i.* prefixes. Alternately... setDT(df) setDT(Inc) df[, Inc := Inc[.(ZIP1), eth, on="ZIP2", with=FALSE], by=eth] This is built on a similar idea. The package vignettes are a good place to start for this sort of syntax.
We can use row/column indexing df$Inc <- Inc[cbind(match(df$ZIP1, Inc$ZIP2), match(df$eth, colnames(Inc)))] df # eth ZIP1 Inc #1 A 1 56 #2 B 1 49 #3 B 2 10 #4 A 3 43 #5 C 5 17
What about this? library(reshape2) merge(df, melt(Inc, id="ZIP2"), by.x = c("ZIP1", "eth"), by.y = c("ZIP2", "variable")) ZIP1 eth value 1 1 A 56 2 1 B 49 3 2 B 10 4 3 A 43 5 5 C 17
Another option: library(dplyr) library(tidyr) Inc %>% gather(eth, value, -ZIP2) %>% left_join(df, ., by = c("eth", "ZIP1" = "ZIP2"))
my solution(which maybe seems awkward) for (i in 1:length(df$eth)) { df$Inc[i] <- Inc[as.character(df$eth[i])][df$ZIP[i],] }
Adding additional observation in panel data in R
I am trying to add additional years to my panel data. Just wondering if you guys have any ideas of quick way of doing it. Keep in mind my real data is T=6, i=4000. # Here is my input data = data.frame(time=c(30,40,50,30,40,50,30,40,50), id=c(1,1,1,2,2,2,3,3,3), d=c(1,4,7,8,14,2,41,11,61)) # declare panel data individ and time pd = pdata.frame(data, c("id","time"), drop.index=FALSE) #this is what I want out... data.out = data.frame(time=c(30,40,50,60,30,40,50,60,30,40,50,60), id=c(1,1,1,1,2,2,2,2,3,3,3,3), d=c(1,4,7,8,9,14,2,41,50,11,61,70)) # declare panel data individ and time pd.data.out = pdata.frame(data.out, c("id","time"), drop.index=FALSE)
I am not quite sure what you are doing but this might help: data = data.frame(time=c(30,40,50,30,40,50,30,40,50), id=c(1,1,1,2,2,2,3,3,3), d=c(1,4,7,8,14,2,41,11,61)) newdata = data.frame(time=c(60,60,60), id=c(1,2,3), d=c(9,50,70)) combodata = rbind(data,newdata) data.out = combodata[order(combodata$id,combodata$time), ] rownames(data.out) = NULL to produce > data.out time id d 1 30 1 1 2 40 1 4 3 50 1 7 4 60 1 9 5 30 2 8 6 40 2 14 7 50 2 2 8 60 2 50 9 30 3 41 10 40 3 11 11 50 3 61 12 60 3 70 and I think this is what you want for time and id, though d is marginally different. If the rows do not need to be ordered then the last three lines of the code can be condensed to data.out = rbind(data,newdata)
Got it... just create new time and id data.frame and merge into it. time = rep(c(unique(as.numeric(as.character(pd$time))),max(as.numeric(as.character(pd$time)))+10), length(unique(pd$id))) id = rep( unique(pd$id), each=max(as.numeric(as.character(pd$id)))+1) data2 = data.frame(time, id) data.out = merge(data2, pd, all.x=T) data.out = data.out[with(data.out, order(id,time) ), ]