Wide to long with many different columns - r

I have used pivot_longer before but this time I have a much more complex wide dataframe and I cannot sort it out. The example code will provide you a reproducible dataframe. I haven't dealt with such thing before so I'm not sure it's correct to try to format this type of df in long format?
df <- data.frame(
ID = as.numeric(c("7","8","10","11","13","15","16")),
AGE = as.character(c("45 – 54","25 – 34","25 – 34","25 – 34","25 – 34","18 – 24","35 – 44")),
GENDER = as.character(c("Female","Female","Male","Female","Other","Male","Female")),
SD = as.numeric(c("3","0","0","0","3","2","0")),
GAMING = as.numeric(c("0","0","0","0","2","2","0")),
HW = as.numeric(c("2","2","0","2","2","2","2")),
R1_1 = as.numeric(c("10","34","69","53","79","55","28")),
M1_1 = as.numeric(c("65","32","64","53","87","55","27")),
P1_1 = as.numeric(c("65","38","67","54","88","44","26")),
R1_2 = as.numeric(c("15","57","37","54","75","91","37")),
M1_2 = as.numeric(c("90","26","42","56","74","90","37")),
P1_2 = as.numeric(c("90","44","33","54","79","95","37")),
R1_3 = as.numeric(c("5","47","80","27","61","19","57")),
M1_3 = as.numeric(c("30","71","80","34","71","15","57")),
P1_3 = as.numeric(c("30","36","81","35","62","8","56")),
R2_1 = as.numeric(c("10","39","75","31","71","80","59")),
M2_1 = as.numeric(c("90","51","74","15","70","75","61")),
P2_1 = as.numeric(c("90","52","35","34","69","83","60")),
R2_2 = as.numeric(c("10","45","31","54","39","95","77")),
M2_2 = as.numeric(c("60","70","40","78","5","97","75")),
P2_2 = as.numeric(c("60","40","41","58","9","97","76")),
R2_3 = as.numeric(c("5","38","78","45","25","16","22")),
M2_3 = as.numeric(c("30","34","84","62","33","52","20")),
P2_3 = as.numeric(c("30","34","82","45","32","16","22")),
R3_1 = as.numeric(c("10","40","41","42","62","89","41")),
M3_1 = as.numeric(c("90","67","37","40","27","89","42")),
P3_1 = as.numeric(c("90","34","51","44","38","84","43")),
R3_2 = as.numeric(c("10","37","20","54","8","93","69")),
M3_2 = as.numeric(c("60","38","21","62","5","95","71")),
P3_2 = as.numeric(c("60","38","23","65","14","92","69")),
R3_3 = as.numeric(c("5","30","62","11","60","32","52")),
M3_3 = as.numeric(c("30","67","34","55","45","25","45")),
P3_3 = as.numeric(c("30","28","41","24","53","23","52")),
R1_4 = as.numeric(c("10","40","61","17","39","72","25")),
M1_4 = as.numeric(c("45","20","63","25","62","70","23")),
P1_4 = as.numeric(c("45","52","56","16","26","72","27")),
R2_4 = as.numeric(c("5","21","70","33","80","68","30")),
M2_4 = as.numeric(c("35","21","69","27","85","69","23")),
P2_4 = as.numeric(c("35","32","34","25","79","63","29")),
R3_4 = as.numeric(c("10","29","68","21","8","71","41")),
M3_4 = as.numeric(c("50","37","66","28","33","65","41")),
P3_4 = as.numeric(c("50","38","47","28","24","71","41"))
)
I would like to sort it out like in the following table
the new column names are extracted from the old ones such that (example) in R1_1:
R is the namer of the column containing the value previously stored
in R1_1
1 (the first character after 'R' in R1_1) is the value used
in column Speed
1 (last character of 'R1_1') is the value used in
column Sound
basically each row corresponds to 1 question answered by 1 person, and each question was answered through 3 different ratings (R, M, P)
thank you!

If I understood you correctly, the following should work:
df %>%
pivot_longer(
cols = matches('[RMP]\\d_\\d'),
names_to = c('RMP', 'Speed', 'Sound'),
values_to = 'Data',
names_pattern = '([RMP])(\\d)_(\\d)'
) %>%
pivot_wider(names_from = RMP, values_from = Data)
This assumes that both “speed” and “sound” are single-digit values. If there’s the possibility of multiple digits, the occurrences of \\d in the patterns above need to be replaced by \\d+.

Solution using our good ol' workhorse reshape. At first we grep the names with a "Wd_d" pattern, as well as their suffixes "d_d" for following use in reshape.
nm <- names(df[grep("_\\d", names(df))])
times <- unique(substr(nm, 2, 4))
res <- reshape(df, idvar="ID", varying=7:42, v.names=unique(substr(nm, 1, 1)),
times=times,direction="long")
Getting us close to the result, we just need to strsplit the newly created "time" variable at the "_" and rbind it to the former.
res <- cbind(res, setNames(type.convert(do.call(rbind.data.frame,
strsplit(res$time, "_"))),
c("Speed", "Sound")))
res <- res[order(res$AGE), ] ## some ordering
Result
head(res)
# ID AGE GENDER SD GAMING HW time R M P Speed Sound
# 15.1_1 15 18 – 24 Male 2 2 2 1_1 55 44 55 1 1
# 15.1_2 15 18 – 24 Male 2 2 2 1_2 90 95 91 1 2
# 15.1_3 15 18 – 24 Male 2 2 2 1_3 15 8 19 1 3
# 15.2_1 15 18 – 24 Male 2 2 2 2_1 75 83 80 2 1
# 15.2_2 15 18 – 24 Male 2 2 2 2_2 97 97 95 2 2
# 15.2_3 15 18 – 24 Male 2 2 2 2_3 52 16 16 2 3

Related

R How to count the number of date events in a date range contingent on an ID code

I'm trying to figure out how I can make R count the number of df$event_dt by each individual df$id in the episode.start - episode.end range for that id in All_date_events that AdhereR produces.
Essentially I want to find the sum of events for that given ID in the date range.
After running this code you'll have 2 data frames.
df, transformed from original df with 4 columns is the input data
All_date_events is the output I'd like to add a event.num column to.
library(dplyr)
library(tidyverse)
library(AdhereR)
library(hablar)
library(lubridate)
# Starting data
df <- data.frame(id = c("1","1","2","2","1","1","1"),
event_dt = dmy('2-1-2012',
'2-4-2012',
'2-5-2012',
'2-7-2012',
'2-12-2012',
'1-3-2013',
'22-5-2013'))
df <- df %>% mutate(ones = "1", class = "N1")
df <- transform(df,ones = as.numeric(ones)))
All_date_events <- compute.treatment.episodes(df,
ID.colname="id",
event.date.colname="event_dt",
event.duration.colname="ones",
event.daily.dose.colname="ones",
medication.class.colname="class",
carryover.within.obs.window = TRUE, # carry-over into the OW
carry.only.for.same.medication = TRUE, # & only for same type
consider.dosage.change = TRUE, # dosage change starts new episode...
medication.change.means.new.treatment.episode = TRUE, # & type change
maximum.permissible.gap = 90, # & a gap longer than 180 days
maximum.permissible.gap.unit = "days", # unit for the above (days)
followup.window.start = 0, # 2-years FUW starts at earliest event
followup.window.start.unit = "days",
followup.window.duration = 365*2,
followup.window.duration.unit = "days",
date.format = "%d/%m/%Y");
Either of these results is what I'm hoping for
id
episode.ID
episode.start
end.episode.gap.days
episode.duration
episode.end
event.num
1
1
2012-01-02
243
92
2012-04-03
2
1
2
2012-12-02
223
172
2013-05-23
3
2
1
2012-05-02
668
62
2012-07-03
2
Id.code
Event.date
Event.start
Event.end
Events.num
Event.Episode
1
2.1.2012
1
2.4.2012
2.1.2012
2.4.2012
2
1
2
2.5.2012
2
2.7.2012
2.5.2012
2.7.2012
2
1
1
2.12.2012
1
21.3.2013
1
22.5.2013
2.5.2012
22.5.2013
3
2

R sampling with if statement and similar number of sample

I need to to create a sample from my dataframe and to do so I am using the code bellow.
name <- sample(c("Adam","John","Henry","Mike"),100,rep = TRUE)
area <- sample(c("run","develop","test"),100,rep = TRUE)
id <- sample(100:200,100,rep = FALSE)
mydata <- as.data.frame(cbind(id,area,name))
qcsample <- mydata %>%
group_by(area) %>%
nest() %>%
mutate(n = c(20, 15, 15)) %>%
mutate(samp = map2(data, n, sample_n)) %>%
select(area, samp) %>%
unnest()
Now, I am getting these results.
table(qcsample$area)
develop run test
15 15 20
--
table(qcsample$name)
Adam Henry John Mike
9 9 16 16
I would like to create a sample that would have more or less the same number of samples for each name eg. Adam - 12, Henry - 12, John - 13, Mike - 13.
How can I achieve that ? can I somehow request that the sample is equally distributed ?
Also, in this example I used function
sample_n
and specified number of samples.
I am anticipating that sometimes there will not be required number from a given group. In my example I am taking 20 samples from area called "test" but sometimes there will be only let's say 10 rows containing "test". The total number is 50 so I need to make sure if there are only 10 "test" the code has to automatically increase the others, so the sample would be "test" - 10, "run" - 20 and "develop" - 20. This can happen to any of the area so I need to test if there is enough rows to create the sample and increase other areas. If there is only 1 it can be added to any of the remaining areas or if the difference is 3 we add 1 to one area and 2 to the another one.
How could I check that taking into account all the possibilities ? I believe there are eight permutations in this case.
Thanks in advance A.
If you are using made up data then you can create a minimum amount of each row and then create filler to get you up to the total:
set.seed(42)
names <- c("Adam", "John", "Henry", "Mike")
areas <- c("run", "develop", "test")
totalrows <- 100
minname <- 22 # No less than 20 of each name (set to near threshold to test)
minarea <- 30 # No less than 30 of each area (less randomness the higher these are)
qcsample <- data.frame(
name=sample(c(rep(names, minname), sample(names, totalrows-length(names)*minname, replace=T))),
area=sample(c(rep(areas, minarea), sample(areas, totalrows-length(areas)*minarea, replace=T))),
id=sample(99+(1:totalrows))
)
This results in:
R> table(qcsample$name)
Adam Henry John Mike
23 28 24 25
R> table(qcsample$area)
develop run test
37 31 32
Notice that the count of name to area isn't constrained:
R> table(qcsample[,-3])
area
name develop run test
Adam 5 11 7
Henry 11 8 9
John 10 7 7
Mike 11 5 9
R>
Using a loop as suggested by #r2evans:
library(dplyr)
set.seed(42)
mydata <- data.frame(
name = sample(c("Adam","John","Henry","Mike"), 100, rep = TRUE),
area = sample(c("run","develop","test"), 100, rep = TRUE),
id = sample(100:200, 100, rep = FALSE)
)
Nsamples <- 50
mysample <- data.frame(sample_n(mydata, Nsamples))
minname <- 11 # max is 50/4 -> 12
minarea <- 15 # max is 50/3 -> 16
# the test you were asking about
while( (min(table(mysample$name)) < minname) || (min(table(mysample$area)) < minarea) ) {
mysample <- data.frame(sample_n(mydata, Nsamples))
}
This results in:
R> table(mysample$name)
Adam Henry John Mike
13 15 11 11
R> table(mysample$area)
develop run test
15 17 18
And, like before, there's no minimum of name to area.
R> table(mysample[-3])
area
name develop run test
Adam 4 3 6
Henry 2 6 7
John 4 4 3
Mike 5 4 2
If you needed to enforce an minimum number for each permutation add this to the test:
while(... || (min(table(mysample[-3])) < some_min)) {
BTW, the number of permutations, as you can see by the table, is the number of names times the number of areas.
Here's another thought.
Depending on your desired end-size, it might over-create the number of samples so that it can reduce some name/area pairs to bring the total down.
Let's say that you want to end up with a total of 50 rows:
final_size <- 50
For completeness, here are the sets from which we'll choose:
avail_names <- c("Adam", "John", "Henry", "Mike")
avail_areas <- c("run", "develop", "test")
and the minimum we need to create for Adam,run (etc) in order to certainly end up with no less than final_size rows:
size_per_namearea <- ceiling(final_size / (length(avail_names) * length(avail_areas)))
Ok, generate at least as many (likely more than) the number of rows we need:
set.seed(20180920)
qcsample <- crossing(data_frame(rownum = seq_len(size_per_namearea)),
data_frame(name = avail_names),
data_frame(area = avail_areas)) %>%
group_by(name, area) %>%
mutate(id = sample(100, size = n(), replace = FALSE))
qcsample
# # A tibble: 60 x 4
# # Groups: name, area [12]
# rownum name area id
# <int> <chr> <chr> <int>
# 1 1 Adam run 59
# 2 1 Adam develop 51
# 3 1 Adam test 23
# 4 1 John run 71
# 5 1 John develop 5
# 6 1 John test 24
# 7 1 Henry run 4
# 8 1 Henry develop 29
# 9 1 Henry test 79
# 10 1 Mike run 77
# # ... with 50 more rows
Verify we have identical sample sizes for each name/area:
xtabs(~ name + area, data = qcsample) %>%
stats::addmargins()
# area
# name develop run test Sum
# Adam 5 5 5 15
# Henry 5 5 5 15
# John 5 5 5 15
# Mike 5 5 5 15
# Sum 20 20 20 60
If we just do head(final_size), then we know which names we will be cutting short, which undermines the randomness of your sampling a little. The reason I added rownum up front was so that I can arrange by it plus a jitter, ensuring I get all of max(rownum)-1, and then some sampling of max(rownum), guaranteeing that each name/area pair have either max(rownum)-1 or max(rownum) rows; your tallies are never different by more than 1.
reducedsample <- arrange(qcsample, rownum + runif(n())) %>%
head(final_size) %>%
select(-rownum)
reducedsample %>%
xtabs(~ name + area, data = .) %>%
stats::addmargins()
# area
# name develop run test Sum
# Adam 4 4 5 13
# Henry 5 4 4 13
# John 4 4 4 12
# Mike 4 4 4 12
# Sum 17 16 17 50

R: Splitting dataset by pre-determined values

I have data that looks like this (but larger):
Pos Value
0 66.81967
1 66.36885
2 65.79508
3 65.27049
4 64.88525
5 64.97541
6 65.39344
7 65.99181
8 66.63115
9 66.95901
10 66.89344
11 66.44262
12 65.90984
13 65.49181
14 65.35246
I have already determined the maxima and saved the position values of each to a vector like so:
9 19 30 42 56 69 80 92 107 118 130 143 154 164 176 188 199 211
222 234 245
I now want to split the data based on the value of the maxima, so for the sample data I'd want to split the dataset into the values for Positions 0->9 and into the values for Positions 10-15, and save each of these sub-sets into vectors of their own.
I'm new to R (and coding) and was wondering how to best go about this.
Suppose your data frame is dat and your maxima values are in a vector maxima, you might use
split(dat, cut(dat$Pos, breaks = maxima, include.lowest = TRUE))
For your example data frame:
dat <-
structure(list(Pos = 0:14, Value = c(66.81967, 66.36885, 65.79508,
65.27049, 64.88525, 64.97541, 65.39344, 65.99181, 66.63115, 66.95901,
66.89344, 66.44262, 65.90984, 65.49181, 65.35246)), .Names = c("Pos",
"Value"), class = "data.frame", row.names = c(NA, -15L))
and the first few values of your maxima in the range:
maxima <- c(0, 10, 19)
my code gives you a list of data frames
#$`[0,10]`
# Pos Value
#1 0 66.81967
#2 1 66.36885
#3 2 65.79508
#4 3 65.27049
#5 4 64.88525
#6 5 64.97541
#7 6 65.39344
#8 7 65.99181
#9 8 66.63115
#10 9 66.95901
#11 10 66.89344
#
#$`(10,19]`
# Pos Value
#12 11 66.44262
#13 12 65.90984
#14 13 65.49181
#15 14 65.35246
If you don't want data frames, but just Value, use
split(dat$Value, cut(dat$Pos, breaks = maxima, include.lowest = TRUE))
#$`[0,10]`
# [1] 66.81967 66.36885 65.79508 65.27049 64.88525 64.97541 65.39344 65.99181
# [9] 66.63115 66.95901 66.89344
#
#$`(10,19]`
# [1] 66.44262 65.90984 65.49181 65.35246
Thanks! How would I go about saving these as separate data frames/sets (not sure on the correct terminology) so that I can then fit them individually?
How about
lst <- split(dat, cut(dat$Pos, breaks = maxima, include.lowest = TRUE))
dir <- getwd()
lapply(seq_len(length(lst)),
function (i) write.csv(lst[[i]], file = paste0(dir,"/",names(lst[i]), ".csv"), row.names = FALSE))
This will save each data frame into a .csv file under directory dir. I have used getwd() to test the code; you may change it to a specific folder.
Not sure if that is the best approach, but I would work with a list and use a for loop like this (untested):
maxpos <- c(9, 19, 30)
ans <- list()
prev <- 1
for (i in seq.int(length(maxpos))) {
ans[[i]] <- dataset[seq(prev, maxpos[i]),]
prev <- maxpos[i+1]
}
ans[[length(maxpos)+1]] <- dataset[seq(maxpos[length[maxpos]]+1,nrow(dataset)),]

R - Create a new variable where each observation depends on another table and other variables in the data frame

I have the two following tables:
df <- data.frame(eth = c("A","B","B","A","C"),ZIP1 = c(1,1,2,3,5))
Inc <- data.frame(ZIP2 = c(1,2,3,4,5,6,7),A = c(56,98,43,4,90,19,59), B = c(49,10,69,30,10,4,95),C = c(69,2,59,8,17,84,30))
eth ZIP1 ZIP2 A B C
A 1 1 56 49 69
B 1 2 98 10 2
B 2 3 43 69 59
A 3 4 4 30 8
C 5 5 90 10 17
6 19 4 84
7 59 95 39
I would like to create a variable Inc in the df data frame where for each observation, the value is the intersection of the eth and ZIP of the observation. In my example, it would lead to:
eth ZIP1 Inc
A 1 56
B 1 49
B 2 10
A 3 43
C 5 17
A loop or quite brute force could solve it but it takes time on my dataset, I'm looking for a more subtle way maybe using data.table. It seems to me that it is a very standard question and I'm apologizing if it is, my unability to formulate a precise title for this problem (as you may have noticed..) is maybe why I haven't found any similar question in searching on the forum..
Thanks !
Sure, it can be done in data.table:
library(data.table)
setDT(df)
df[ melt(Inc, id.var="ZIP2", variable.name="eth", value.name="Inc"),
Inc := i.Inc
, on=c(ZIP1 = "ZIP2","eth") ]
The syntax for this "merge-assign" operation is X[i, Xcol := expression, on=merge_cols].
You can run the i = melt(Inc, id.var="ZIP", variable.name="eth", value.name="Inc") part on its own to see how it works. Inside the merge, columns from i can be referred to with i.* prefixes.
Alternately...
setDT(df)
setDT(Inc)
df[, Inc := Inc[.(ZIP1), eth, on="ZIP2", with=FALSE], by=eth]
This is built on a similar idea. The package vignettes are a good place to start for this sort of syntax.
We can use row/column indexing
df$Inc <- Inc[cbind(match(df$ZIP1, Inc$ZIP2), match(df$eth, colnames(Inc)))]
df
# eth ZIP1 Inc
#1 A 1 56
#2 B 1 49
#3 B 2 10
#4 A 3 43
#5 C 5 17
What about this?
library(reshape2)
merge(df, melt(Inc, id="ZIP2"), by.x = c("ZIP1", "eth"), by.y = c("ZIP2", "variable"))
ZIP1 eth value
1 1 A 56
2 1 B 49
3 2 B 10
4 3 A 43
5 5 C 17
Another option:
library(dplyr)
library(tidyr)
Inc %>%
gather(eth, value, -ZIP2) %>%
left_join(df, ., by = c("eth", "ZIP1" = "ZIP2"))
my solution(which maybe seems awkward)
for (i in 1:length(df$eth)) {
df$Inc[i] <- Inc[as.character(df$eth[i])][df$ZIP[i],]
}

Adding additional observation in panel data in R

I am trying to add additional years to my panel data. Just wondering if you guys have any ideas of quick way of doing it. Keep in mind my real data is T=6, i=4000.
# Here is my input
data = data.frame(time=c(30,40,50,30,40,50,30,40,50),
id=c(1,1,1,2,2,2,3,3,3),
d=c(1,4,7,8,14,2,41,11,61))
# declare panel data individ and time
pd = pdata.frame(data, c("id","time"), drop.index=FALSE)
#this is what I want out...
data.out = data.frame(time=c(30,40,50,60,30,40,50,60,30,40,50,60),
id=c(1,1,1,1,2,2,2,2,3,3,3,3),
d=c(1,4,7,8,9,14,2,41,50,11,61,70))
# declare panel data individ and time
pd.data.out = pdata.frame(data.out, c("id","time"), drop.index=FALSE)
I am not quite sure what you are doing but this might help:
data = data.frame(time=c(30,40,50,30,40,50,30,40,50),
id=c(1,1,1,2,2,2,3,3,3),
d=c(1,4,7,8,14,2,41,11,61))
newdata = data.frame(time=c(60,60,60),
id=c(1,2,3),
d=c(9,50,70))
combodata = rbind(data,newdata)
data.out = combodata[order(combodata$id,combodata$time), ]
rownames(data.out) = NULL
to produce
> data.out
time id d
1 30 1 1
2 40 1 4
3 50 1 7
4 60 1 9
5 30 2 8
6 40 2 14
7 50 2 2
8 60 2 50
9 30 3 41
10 40 3 11
11 50 3 61
12 60 3 70
and I think this is what you want for time and id, though d is marginally different. If the rows do not need to be ordered then the last three lines of the code can be condensed to
data.out = rbind(data,newdata)
Got it... just create new time and id data.frame and merge into it.
time = rep(c(unique(as.numeric(as.character(pd$time))),max(as.numeric(as.character(pd$time)))+10), length(unique(pd$id)))
id = rep( unique(pd$id), each=max(as.numeric(as.character(pd$id)))+1)
data2 = data.frame(time, id)
data.out = merge(data2, pd, all.x=T)
data.out = data.out[with(data.out, order(id,time) ), ]

Resources