I have data as follows:
dat <- structure(list(ZIP_source1 = c(1026, 1026, 1026, 1026, 1026,
1026, 1026, 1026, 1026, 1026, 1017, 1012, 1012), ZIP_source2 = c(1026,
1026, 1026, 1026, 1026, 1026, NA, NA, NA, NA, NA, 1012, 1012),
Category_source2 = c(4, 4, 4, 4, 4, 4, NA, NA, NA, NA, NA, 4, 4)), class = c("data.table",
"data.frame"), row.names = c(NA, -13L))
dat
ZIP_source1 ZIP_source2 Category_source2
1: 1026 1026 4
2: 1026 1026 4
3: 1026 1026 4
4: 1026 1026 4
5: 1026 1026 4
6: 1026 1026 4
7: 1026 NA NA
8: 1026 NA NA
9: 1026 NA NA
10: 1026 NA NA
11: 1017 NA NA
12: 1012 1012 4
13: 1012 1012 4
For rows 7 to 10, I know the zip code from source 1, and from source 2 I know that this zip code falls in category 4. What is the best way to fill in the missing categories?
Desired output:
ZIP_source1 ZIP_source2 Category_source2
1: 1026 1026 4
2: 1026 1026 4
3: 1026 1026 4
4: 1026 1026 4
5: 1026 1026 4
6: 1026 1026 4
7: 1026 NA 4
8: 1026 NA 4
9: 1026 NA 4
10: 1026 NA 4
11: 1017 NA NA
12: 1012 1012 4
13: 1012 1012 4
We can use fill from tidyr, grouping by ZIP_source1:
library(dplyr)
library(tidyr)
dat %>%
  group_by(ZIP_source1) %>%
  fill(Category_source2, .direction = "downup")
Or using nafill from data.table (LOCF, then NOCB, within each ZIP_source1 group):
library(data.table)
dat[, Category_source2 := nafill(nafill(Category_source2,
    type = "locf"), type = "nocb"), ZIP_source1]
Output:
> dat
ZIP_source1 ZIP_source2 Category_source2
<num> <num> <num>
1: 1026 1026 4
2: 1026 1026 4
3: 1026 1026 4
4: 1026 1026 4
5: 1026 1026 4
6: 1026 1026 4
7: 1026 NA 4
8: 1026 NA 4
9: 1026 NA 4
10: 1026 NA 4
11: 1017 NA NA
12: 1012 1012 4
13: 1012 1012 4
I'd prefer to create new columns to do this, which I will call zip and category, but it's straightforward to overwrite the original columns if you want.
# Get the zip code from whichever source is not NA
dat <- dat %>%
  mutate(
    zip = coalesce(ZIP_source1, ZIP_source2)
  )
# Create a lookup table of the known category for each zip
category_table <- dat %>%
  select(Category_source2, zip) %>%
  drop_na() %>%
  group_by(zip) %>%
  distinct() %>%
  rename(category = Category_source2)
category_table
# category zip
# <dbl> <dbl>
# 1 4 1026
# 2 4 1012
# Join as new column
left_join(dat, category_table, by = "zip")
# ZIP_source1 ZIP_source2 Category_source2 zip category
# 1 1026 1026 4 1026 4
# 2 1026 1026 4 1026 4
# 3 1026 1026 4 1026 4
# 4 1026 1026 4 1026 4
# 5 1026 1026 4 1026 4
# 6 1026 1026 4 1026 4
# 7 1026 NA NA 1026 4
# 8 1026 NA NA 1026 4
# 9 1026 NA NA 1026 4
# 10 1026 NA NA 1026 4
# 11 1017 NA NA 1017 NA
# 12 1012 1012 4 1012 4
# 13 1012 1012 4 1012 4
I used this method to gather mean and sd results successfully before, here. Then I tried to use the same method to gather my gene-count DEG data with "logFC", "CI.L", "CI.R" and "adj.P.Val", but I failed because something is wrong with my result.
Just like this:
library(dplyr)   # for bind_rows() used below
library(tidyr)   # for pivot_longer()/pivot_wider()/separate() used below
data_1<-data.frame(matrix(sample(1:1200,1200,replace = T),48,25))
names(data_1) <- c(paste0("Gene_", 1:25))
rownames(data_1)<-NULL
head(data_1)
A<-paste0(1:48,"_logFC")
data_logFC<-data.frame(A=A,data_1)
#
data_2<-data.frame(matrix(sample(1:1200,1200,replace = T),48,25))
names(data_2) <- c(paste0("Gene_", 1:25))
rownames(data_2)<-NULL
B_L<-paste0(1:48,"_CI.L")
data_CIL<-data.frame(A=B_L,data_2)
data_CIL[1:48,1:6]
#
data_3<-data.frame(matrix(sample(1:1200,1200,replace = T),48,25))
names(data_3) <- c(paste0("Gene_", 1:25))
rownames(data_3)<-NULL
C_R<-paste0(1:48,"_CI.R")
data_CIR<-data.frame(A=C_R,data_3)
data_CIR[1:48,1:6]
#
data_4<-data.frame(matrix(sample(1:1200,1200,replace = T),48,25))
names(data_4) <- c(paste0("Gene_", 1:25))
rownames(data_4)<-NULL
D<-paste0(1:48,"_adj.P.Val")
data_ajustP<-data.frame(A=D,data_4)
data_ajustP[1:48,1:6]
# combine data_logFC data_CIL data_CIR data_ajustP
data <- bind_rows(list(
  logFC = data_logFC,
  CIL = data_CIL,
  CIR = data_CIR,
  AJSTP = data_ajustP
), .id = "stat")
data[1:10,1:6]
data_DEG <- data %>%
  pivot_longer(-c(stat, A), names_to = "Gene", values_to = "value") %>%
  pivot_wider(names_from = "stat", values_from = "value")
head(data_DEG,100)
str(data_DEG$CIL)
> head(data_DEG,100)
# A tibble: 100 x 6
A Gene logFC CIL CIR AJSTP
<chr> <chr> <int> <int> <int> <int>
1 1_logFC Gene_1 504 NA NA NA
2 1_logFC Gene_2 100 NA NA NA
3 1_logFC Gene_3 689 NA NA NA
4 1_logFC Gene_4 779 NA NA NA
5 1_logFC Gene_5 397 NA NA NA
6 1_logFC Gene_6 1152 NA NA NA
7 1_logFC Gene_7 780 NA NA NA
8 1_logFC Gene_8 155 NA NA NA
9 1_logFC Gene_9 142 NA NA NA
10 1_logFC Gene_10 1150 NA NA NA
# … with 90 more rows
Why are there so many NAs?
Can somebody help me? Very thankful.
EDIT:
I had confused the real sample grouping of my data, so I was reshaping it without the right index: the values in A (e.g. 1_logFC vs 1_CI.L) differ between the stat groups, so pivot_wider had no common key to match rows on, which is why the NAs appeared.
Here is my right method:
data[1:10, 1:6]
# split A (e.g. "1_logFC") into the sample name and the redundant stat suffix
data <- separate(data, A, c("Name", "stat2"), "_")
data <- data[, -3]   # drop the stat2 column; "stat" already holds this information
data_DEG <- data %>%
  pivot_longer(-c(stat, Name), names_to = "Gene", values_to = "value") %>%
  pivot_wider(names_from = "stat", values_from = "value")
head(data_DEG,10)
tail(data_DEG,10)
> head(data_DEG,10)
# A tibble: 10 x 6
Name Gene logFC CIL CIR AJSTP
<chr> <chr> <int> <int> <int> <int>
1 1 Gene_1 504 1116 774 278
2 1 Gene_2 100 936 448 887
3 1 Gene_3 689 189 718 933
4 1 Gene_4 779 943 690 19
5 1 Gene_5 397 976 40 135
6 1 Gene_6 1152 304 343 647
7 1 Gene_7 780 1076 796 1024
8 1 Gene_8 155 645 469 180
9 1 Gene_9 142 256 889 1047
10 1 Gene_10 1150 976 1194 670
> tail(data_DEG,10)
# A tibble: 10 x 6
Name Gene logFC CIL CIR AJSTP
<chr> <chr> <int> <int> <int> <int>
1 48 Gene_16 448 633 1080 1122
2 48 Gene_17 73 772 14 388
3 48 Gene_18 652 999 699 912
4 48 Gene_19 600 1163 512 241
5 48 Gene_20 428 1119 1142 348
6 48 Gene_21 66 553 240 82
7 48 Gene_22 753 1119 630 117
8 48 Gene_23 1017 305 1120 447
9 48 Gene_24 432 1175 447 670
10 48 Gene_25 482 394 371 696
It's a perfect result!!
Thank you all for your answers; I thought I was smarter than I am and hoped I would have understood some of it. I think I messed up the visualisation of my data as well. I have edited my post to better show my sample data. Sorry for the inconvenience, and I truly hope that someone can help me.
I have a question about reshaping my data. The data collected looks as such:
data <- read.table(header=T, text='
pid measurement1 Tdays1 measurement2 Tdays2 measurement3 Tdays3 measurement4 Tdays4
1 1356 1435 1483 1405 1563 1374 NA NA
2 943 1848 1173 1818 1300 1785 NA NA
3 1590 185 NA NA NA NA 1585 294
4 130 72 443 70 NA NA 136 79
4 140 82 NA NA NA NA 756 89
4 220 126 266 124 NA NA 703 128
4 166 159 213 156 476 145 776 166
4 380 189 583 173 NA NA 586 203
4 353 231 510 222 656 217 526 240
4 180 268 NA NA NA NA NA NA
4 NA NA NA NA NA NA 580 278
4 571 334 596 303 816 289 483 371
')
Now i would like it to look something like this:
PID Time Value
1 1435 1356
1 1405 1483
1 1374 1563
2 1848 943
2 1818 1173
2 1785 1300
3 185 1590
... ... ...
How would I get there? I have looked up some things about wide-to-long format, but they don't seem to do the trick. I am relatively new to RStudio and Stack Overflow (if you couldn't tell that already).
Kind regards, and thank you in advance.
Here is a slightly different pivot_longer() version.
library(tidyr)
library(dplyr)
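# Note: dw is assumed to be the small example frame used elsewhere on this page,
# reproduced here so the pipe below is runnable:
dw <- read.table(header = TRUE, text = '
PID T1 measurement1 T2 measurement2 T3 measurement3
1 1 100 4 200 7 50
2 2 150 5 300 8 60
3 3 120 6 210 9 70
')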
dw %>%
  pivot_longer(cols = -PID, names_to = ".value", names_pattern = "(.+)[0-9]")
# A tibble: 9 x 3
PID T measurement
<dbl> <dbl> <dbl>
1 1 1 100
2 1 4 200
3 1 7 50
4 2 2 150
5 2 5 300
6 2 8 60
7 3 3 120
8 3 6 210
9 3 9 70
The names_to = ".value" argument creates new columns from column names based on the names_pattern argument. The names_pattern argument takes a special regex input. In this case, here is the breakdown:
(.+)   # match everything - anything captured like this becomes the ".value"
[0-9]  # a numeric character - tells the pattern that the trailing number
       # is excluded from ".value". If you have multi-digit numbers,
       # use [0-9]+ instead.
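For instance, a minimal sketch of the multi-digit case (the dw10 frame below is made up purely for illustration):
library(dplyr)
library(tidyr)
dw10 <- data.frame(PID = 1:2,
                   measurement10 = c(100, 150), T10 = c(1, 2),
                   measurement11 = c(200, 300), T11 = c(4, 5))
dw10 %>%
  pivot_longer(cols = -PID, names_to = ".value",
               names_pattern = "([A-Za-z]+)[0-9]+")
# one row per PID and suffix, with columns PID, measurement and T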
In the last edit you asked for a solution that is easy to understand. A very simple approach would be to stack the measurement columns on top of each other and the Tdays columns on top of each other. Although specialty packages make things very concise and elegant, for simplicity we can solve this without additional packages. Standard R has a convenient function aptly named stack, which works like this:
> exp <- data.frame(value1 = 1:5, value2 = 6:10)
> stack(exp)
values ind
1 1 value1
2 2 value1
3 3 value1
4 4 value1
5 5 value1
6 6 value2
7 7 value2
8 8 value2
9 9 value2
10 10 value2
We can stack measurements and Tdays separately and then combine them via cbind:
data <- read.table(header=T, text='
pid measurement1 Tdays1 measurement2 Tdays2 measurement3 Tdays3 measurement4 Tdays4
1 1356 1435 1483 1405 1563 1374 NA NA
2 943 1848 1173 1818 1300 1785 NA NA
3 1590 185 NA NA NA NA 1585 294
4 130 72 443 70 NA NA 136 79
4 140 82 NA NA NA NA 756 89
4 220 126 266 124 NA NA 703 128
4 166 159 213 156 476 145 776 166
4 380 189 583 173 NA NA 586 203
4 353 231 510 222 656 217 526 240
4 180 268 NA NA NA NA NA NA
4 NA NA NA NA NA NA 580 278
4 571 334 596 303 816 289 483 371
')
cbind(stack(data, c(measurement1, measurement2, measurement3, measurement4)),
stack(data, c(Tdays1, Tdays2, Tdays3, Tdays4)))
This keeps measurements and Tdays neatly together, but leaves us without pid, which we can add back using rep to replicate the original pid column 4 times:
result <- cbind(pid = rep(data$pid, 4),
stack(data, c(measurement1, measurement2, measurement3, measurement4)),
stack(data, c(Tdays1, Tdays2, Tdays3, Tdays4)))
The head of which looks like
> head(result)
pid values ind values ind
1 1 1356 measurement1 1435 Tdays1
2 2 943 measurement1 1848 Tdays1
3 3 1590 measurement1 185 Tdays1
4 4 130 measurement1 72 Tdays1
5 4 140 measurement1 82 Tdays1
6 4 220 measurement1 126 Tdays1
This is not quite the order you expected, but you can sort the data.frame if that is a concern:
result <- result[order(result$pid), c(1, 4, 2)]
names(result) <- c("pid", "Time", "Value")
leading to the final result
> head(result)
pid Time Value
1 1 1435 1356
13 1 1405 1483
25 1 1374 1563
37 1 NA NA
2 2 1848 943
14 2 1818 1173
tidyverse solution
library(tidyverse)
dw %>%
pivot_longer(-PID) %>%
mutate(name = gsub('^([A-Za-z]+)(\\d+)$', '\\1_\\2', name )) %>%
separate(name, into = c('A', 'B'), sep = '_', convert = T) %>%
pivot_wider(names_from = A, values_from = value)
Gives the following output
# A tibble: 9 x 4
PID B T measurement
<int> <int> <int> <int>
1 1 1 1 100
2 1 2 4 200
3 1 3 7 50
4 2 1 2 150
5 2 2 5 300
6 2 3 8 60
7 3 1 3 120
8 3 2 6 210
9 3 3 9 70
Considering a dataframe, df like the following:
PID T1 measurement1 T2 measurement2 T3 measurement3
1 1 100 4 200 7 50
2 2 150 5 300 8 60
3 3 120 6 210 9 70
You can use this solution to get your required dataframe:
iters = seq(from = 4, to = length(colnames(df)) - 1, by = 2)
finalDf = df[, c(1, 2, 3)]
for (j in iters) {
  # rename the Tj / measurementj pair so rbind() can match the columns by name
  tobind = setNames(df[, c(1, j, j + 1)], names(finalDf))
  finalDf = rbind(finalDf, tobind)
}
finalDf = finalDf[order(finalDf[, 1]), ]
print(finalDf)
The output of the print statement is this:
PID T1 measurement1
1 1 1 100
4 1 4 200
7 1 7 50
2 2 2 150
5 2 5 300
8 2 8 60
3 3 3 120
6 3 6 210
9 3 9 70
Maybe you can try base R's reshape, like below:
reshape(
  # rename e.g. "measurement1" to "measurement.1" so reshape() can split the
  # variable stub ("measurement") from the time index ("1") on the "." separator
  setNames(data, gsub("(\\d+)$", "\\.\\1", names(data))),
  direction = "long",
  varying = 2:ncol(data)
)
require(data.table)
set.seed(333)
t <- data.table(old=1002:2001, dif=sample(1:10,1000, replace=TRUE))
t$new <- t$old + t$dif; t$foo <- rnorm(1000); t$dif <- NULL
i <- data.table(id=1:3, start=sample(1000:1990,3))
> i
id start
1: 1 1002
2: 2 1744
3: 3 1656
> head(t)
old new foo
1: 1002 1007 -0.7889534
2: 1003 1004 0.3901869
3: 1004 1014 0.7907947
4: 1005 1011 2.0964612
5: 1006 1007 1.1834171
6: 1007 1015 1.1397910
I would like to delete rows from t such that only those rows remain where old[i] = new[i-1], giving a continuous chain of some fixed number of time points. Ideally, this would be done for all id in i simultaneously, where start gives the starting points. For example, if we choose n=5, we should obtain
> head(ans)
id old new foo
1: 1 1002 1007 -0.7889534
2: 1 1007 1015 1.1397910
3: 1 1015 1022 -1.2193670
4: 1 1022 1024 1.2039050
5: 1 1024 1026 0.4388586
6: 2 1744 1750 -0.1368320
where rows 3 to 6 cannot be inferred from the head of t shown above, and foo is a stand-in for other variables that need to be kept.
Can this be done efficiently in data.table, for example, using a clever combination of joins?
PS. This question is somewhat similar to an earlier one of mine, but I have modified the situation to make it clearer.
It seems to me that you need help from graph algorithms. If you want to start with 1002, you can try:
require(igraph)
g <- graph_from_edgelist(as.matrix(t[,1:2]))
t[old %in% subcomponent(g,"1002","out")]
# 1: 1002 1007 -0.78895338
# 2: 1007 1015 1.13979100
# 3: 1015 1022 -1.21936662
# 4: 1022 1024 1.20390482
# 5: 1024 1026 0.43885860
# ---
#191: 1981 1988 -0.22054875
#192: 1988 1989 -0.22812175
#193: 1989 1995 -0.04687776
#194: 1995 2000 2.41349730
#195: 2000 2002 -1.23425666
Of course you can do the above for each start you want and limit the results to the first n rows. For instance, we can lapply over the i$start positions and then rbindlist all the values together, declaring an id column with the i$id values. Something like:
n <- 5
rbindlist(
  setNames(lapply(i$start, function(x) t[old %in% subcomponent(g, x, "out")[1:n]]), i$id),
  idcol = "id")
# id old new foo
# 1: 1 1002 1007 -0.7889534
# 2: 1 1007 1015 1.1397910
# 3: 1 1015 1022 -1.2193666
# 4: 1 1022 1024 1.2039048
# 5: 1 1024 1026 0.4388586
# 6: 2 1744 1750 -0.1368320
# 7: 2 1750 1758 0.3331686
# 8: 2 1758 1763 1.3040357
# 9: 2 1763 1767 -1.1715528
#10: 2 1767 1775 0.2841251
#11: 3 1656 1659 -0.1556208
#12: 3 1659 1663 0.1663042
#13: 3 1663 1669 0.3781835
#14: 3 1669 1670 0.2760948
#15: 3 1670 1675 0.3745026
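For comparison, the same chain-following could be sketched with plain data.table keyed joins and no igraph. This is only an illustrative alternative (the helper walk_chain below is made up for this sketch), assuming each old value occurs at most once in t:
library(data.table)
n <- 5
setkey(t, old)                       # key on 'old' for fast lookups
walk_chain <- function(start, n) {
  out <- vector("list", n)
  cur <- start
  for (k in seq_len(n)) {
    row <- t[.(cur)]                 # keyed join: the row whose old == cur
    if (is.na(row$new)) break        # chain ends early (no matching row)
    out[[k]] <- row
    cur <- row$new                   # follow the link to the next time point
  }
  rbindlist(out)
}
ans <- rbindlist(setNames(lapply(i$start, walk_chain, n = n), i$id), idcol = "id")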
I'm trying to build a new table that spreads the values of the S1-S4 columns into column headers and fills each one with a calculation based on the T column.
Cust T S1 S2 S3 S4
1009 150 1007 1006 1001 1000
1010 50 1007 1006 1001 1000
1011 50 1007 1006 1001 1000
1013 10000 1007 1006 1001 1000
1931 60 1008 1007 1006 1005
1141 1000 1014 1013 1007 1006
I need to make a new table where it is:
Cust 1014 1013 1008 1007 1006 1001 1000
1009 NA NA NA T*.1 T*.1 T*.05 T*.025
1010 NA NA NA T*.1 T*.1 T*.05 T*.025
1011 NA NA NA T*.1 T*.1 T*.05 T*.025
1013 NA NA NA T*.1 T*.1 T*.05 T*.025
1931 NA NA T*.1 T*.1 T*.05 T*.025 NA
1141 T*.1 T*.1 NA T*.05 T*.025 NA NA
I just can't seem to figure it out and I'm not even sure if it is possible.
A tidyverse solution:
library(tidyverse)
df %>% gather(select = -c(Cust, T)) %>%
  select(-key) %>%
  spread(value, T) %>%
  map2_dfc(c(1, .025, .05, rep(.1, 6)), ~ .x * .y)
# Cust `1000` `1001` `1005` `1006` `1007` `1008` `1013` `1014`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1009 3.75 7.5 NA 15 15 NA NA NA
# 2 1010 1.25 2.5 NA 5 5 NA NA NA
# 3 1011 1.25 2.5 NA 5 5 NA NA NA
# 4 1013 250 500 NA 1000 1000 NA NA NA
# 5 1141 NA NA NA 100 100 NA 100 100
# 6 1931 NA NA 6 6 6 6 NA NA
library(dplyr)
library(tidyr)
library(data.table)
df %>% gather(key = k, value = val, -c('Cust', 'T')) %>%
  mutate(val_upd = ifelse(k == 'S1' | k == 'S2', 'T*.1',
                          ifelse(k == 'S3', 'T*.05', 'T*.025'))) %>%
  # Change 'T*.1' to T*.1 to get the actual value
  select(-T, -k) %>% dcast(Cust ~ val, value.var = 'val_upd')
Cust 1000 1001 1005 1006 1007 1008 1013 1014
1 1009 T*.025 T*.05 <NA> T*.1 T*.1 <NA> <NA> <NA>
2 1010 T*.025 T*.05 <NA> T*.1 T*.1 <NA> <NA> <NA>
3 1011 T*.025 T*.05 <NA> T*.1 T*.1 <NA> <NA> <NA>
4 1013 T*.025 T*.05 <NA> T*.1 T*.1 <NA> <NA> <NA>
5 1141 <NA> <NA> <NA> T*.025 T*.05 <NA> T*.1 T*.1
6 1931 <NA> <NA> T*.025 T*.05 T*.1 T*.1 <NA> <NA>
Data
df <- read.table(text = "
Cust T S1 S2 S3 S4
1009 150 1007 1006 1001 1000
1010 50 1007 1006 1001 1000
1011 50 1007 1006 1001 1000
1013 10000 1007 1006 1001 1000
1931 60 1008 1007 1006 1005
1141 1000 1014 1013 1007 1006
", header=TRUE)
This is one way using a combination of reshape2::melt, dplyr::select, tidyr::spread and dplyr::mutate. May not be the best way, but it should do what you want:
# Read the data (if you don't already have it loaded)
df <- read.table(text="Cust T S1 S2 S3 S4
1009 150 1007 1006 1001 1000
1010 50 1007 1006 1001 1000
1011 50 1007 1006 1001 1000
1013 10000 1007 1006 1001 1000", header=T)
# Manipulate your data.frame. Replace df with the name of your data.frame
reshape2::melt(df, c("Cust", "T"), c("S1", "S2", "S3", "S4")) %>%
dplyr::select(-variable) %>%
tidyr::spread(value, T) %>%
dplyr::mutate(`1007`=`1007`*0.1,
`1006`=`1006`*0.1,
`1001`=`1001`*0.05,
`1000`=`1000`*0.025)
# Cust 1000 1001 1006 1007
#1 1009 3.75 7.5 15 15
#2 1010 1.25 2.5 5 5
#3 1011 1.25 2.5 5 5
#4 1013 250.00 500.0 1000 1000
You'll need the backticks as R doesn't handle having numeric colnames very well.
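A trivial illustration of why the backticks matter (the object name res here is hypothetical):
res <- data.frame(`1007` = c(15, 5), check.names = FALSE)
res$`1007` * 0.1   # works
# res$1007         # would be a syntax error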
Let me know if I've misunderstood anything or if something doesn't make sense.
dataset:
zip acs.pop napps pperct cgrp zgrp perc
1: 12007 97 2 2.0618557 2 1 25.000000
2: 12007 97 2 2.0618557 NA 2 50.000000
3: 12007 97 2 2.0618557 1 1 25.000000
4: 12008 485 2 0.4123711 2 1 33.333333
5: 12008 485 2 0.4123711 4 1 33.333333
6: 12008 485 2 0.4123711 NA 1 33.333333
7: 12009 7327 187 2.5522042 4 76 26.206897
8: 12009 7327 187 2.5522042 1 41 14.137931
9: 12009 7327 187 2.5522042 2 23 7.931034
10: 12009 7327 187 2.5522042 NA 103 35.517241
11: 12009 7327 187 2.5522042 3 47 16.206897
12: 12010 28802 580 2.0137490 NA 275 32.163743
13: 12010 28802 580 2.0137490 4 122 14.269006
14: 12010 28802 580 2.0137490 1 269 31.461988
15: 12010 28802 580 2.0137490 2 96 11.228070
16: 12010 28802 580 2.0137490 3 93 10.877193
17: 12018 7608 126 1.6561514 3 30 16.129032
18: 12018 7608 126 1.6561514 NA 60 32.258065
19: 12018 7608 126 1.6561514 2 14 7.526882
20: 12018 7608 126 1.6561514 4 57 30.645161
21: 12018 7608 126 1.6561514 1 25 13.440860
22: 12019 14841 144 0.9702850 NA 62 30.097087
23: 12019 14841 144 0.9702850 4 73 35.436893
24: 12019 14841 144 0.9702850 3 30 14.563107
25: 12019 14841 144 0.9702850 1 23 11.165049
26: 12019 14841 144 0.9702850 2 18 8.737864
27: 12020 31403 343 1.0922523 3 76 14.960630
28: 12020 31403 343 1.0922523 1 88 17.322835
29: 12020 31403 343 1.0922523 2 38 7.480315
30: 12020 31403 343 1.0922523 4 141 27.755906
31: 12020 31403 343 1.0922523 NA 165 32.480315
32: 12022 1002 5 0.4990020 NA 4 44.444444
33: 12022 1002 5 0.4990020 4 2 22.222222
34: 12022 1002 5 0.4990020 3 1 11.111111
35: 12022 1002 5 0.4990020 1 1 11.111111
I know the reshape2 or reshape package can handle this, but I'm not sure how. I need the final output to look like this:
zip acs.pop napps pperct zgrp4 zgrp3 zgrp2 zgrp1 perc4 perc3 perc2 perc1
12009 7327 187 2.5522042 76 47 23 41 26.206897 16.206897 7.931034 14.137931
zip is the id
acs.pop, napps, pperct will be the same for each zip group
zgrp4…zgrp1 are the values of zgrp for each value of cgrp
perc4…perc1 are the values of perc for each value of cgrp
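Since cgrp itself identifies the new columns, one possible sketch (assuming a data.table version whose dcast() accepts multiple value.var columns; the resulting names are zgrp_1/perc_1 style rather than zgrp4/perc4) would be to spread on cgrp directly:
library(data.table)
# df1 is assumed to hold the dataset printed above
setDT(df1)
dcast(df1, zip + acs.pop + napps + pperct ~ cgrp,
      value.var = c("zgrp", "perc"))
# gives columns zgrp_1 ... zgrp_4, zgrp_NA, perc_1 ... perc_4, perc_NA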
We can try dcast from the devel version of data.table, which can take multiple value.var columns. In this case, 'zgrp' and 'perc' are the value columns. Using the grouping variables, we create a sequence variable ('ind') and then use dcast to convert from 'long' to 'wide' format.
Instructions to install the devel version are here
library(data.table)#v1.9.5
setDT(df1)[, ind := 1:.N, .(zip, acs.pop, napps, pperct)]
dcast(df1, zip + acs.pop + napps + pperct ~ ind, value.var = c('zgrp', 'perc'))
# zip acs.pop napps pperct 1_zgrp 2_zgrp 3_zgrp 4_zgrp 5_zgrp 1_perc
#1: 12007 97 2 2.0618557 1 2 1 NA NA 25.00000
#2: 12008 485 2 0.4123711 1 1 1 NA NA 33.33333
#3: 12009 7327 187 2.5522042 76 41 23 103 47 26.20690
#4: 12010 28802 580 2.0137490 275 122 269 96 93 32.16374
#5: 12018 7608 126 1.6561514 30 60 14 57 25 16.12903
#6: 12019 14841 144 0.9702850 62 73 30 23 18 30.09709
#7: 12020 31403 343 1.0922523 76 88 38 141 165 14.96063
#8: 12022 1002 5 0.4990020 4 2 1 1 NA 44.44444
# 2_perc 3_perc 4_perc 5_perc
#1: 50.00000 25.000000 NA NA
#2: 33.33333 33.333333 NA NA
#3: 14.13793 7.931034 35.51724 16.206897
#4: 14.26901 31.461988 11.22807 10.877193
#5: 32.25807 7.526882 30.64516 13.440860
#6: 35.43689 14.563107 11.16505 8.737864
#7: 17.32284 7.480315 27.75591 32.480315
#8: 22.22222 11.111111 11.11111 NA
Or we can use ave/reshape from base R
df2 <- transform(df1, ind=ave(seq_along(zip), zip,
acs.pop, napps, pperct, FUN=seq_along))
reshape(df2, idvar=c('zip', 'acs.pop', 'napps', 'pperct'),
timevar='ind', direction='wide')
This is a good use for spread() in tidyr.
library(dplyr)
library(tidyr)
df %>% filter(!is.na(cgrp)) %>%   # if cgrp is missing I don't know where to put the obs
  gather(Var, Val, 6:7) %>%       # one row per measure (zgrp OR perc) observed
  group_by(zip, acs.pop, napps, pperct) %>%  # unique combos of these define rows in the output
  unite(Var1, Var, cgrp) %>%      # identify which obs is for which measure
  spread(Var1, Val)               # make columns zgrp_1, zgrp_2, ..., perc_1, perc_2, ...
Example output:
> df2[df2$zip==12009,]
Source: local data frame [1 x 12]
zip acs.pop napps pperct perc_1 perc_2 perc_3 perc_4 zgrp_1 zgrp_2 zgrp_3 zgrp_4
1 12009 7327 187 2.552204 14.13793 7.931034 16.2069 26.2069 41 23 47 76
Thanks to @akrun for the assist.