split columns according to sequence numbers - r

I have a dataset like this:
seq X
1 a
2 b
3 c
1 d
2 e
1 f
2 g
3 h
4 i
5 j
And I would like to split/group the columns according to the assigned seq, like this:
seq X seq1 X1 seq2 X2
1 a 1 d 1 f
2 b 2 e 2 g
3 c NA NA 3 h
NA NA NA NA 4 i
NA NA NA NA 5 j
Thank you in advance

We need to split the data frame first and apply a custom function that merges unequal data frames, i.e.
do.call(cbindPad, split(df, cumsum(df$seq == 1)))
# 1.seq 1.X 2.seq 2.X 3.seq 3.X
#1 1 a 1 d 1 f
#2 2 b 2 e 2 g
#3 3 c NA <NA> 3 h
#4 NA <NA> NA <NA> 4 i
#5 NA <NA> NA <NA> 5 j
where cbindPad was taken from @joran's answer at this post

This was just for exploration. @Sotos, would something of this kind work? By the way, this involves a lot of transposing, which is not efficient (rbind.fill comes from the plyr package).
df1 = split(df, cumsum(df$seq == 1))
df2 = lapply(df1 , function(x) as.data.frame(t(x)))
#$`1`
# V1 V2 V3
#seq 1 2 3
#X a b c
#$`2`
# V1 V2
#seq 1 2
#X d e
#$`3`
# V1 V2 V3 V4 V5
#seq 1 2 3 4 5
#X f g h i j
data.frame(t(rbind.fill(df2)))
# X1 X2 X3 X4 X5 X6
#V1 1 a 1 d 1 f
#V2 2 b 2 e 2 g
#V3 3 c <NA> <NA> 3 h
#V4 <NA> <NA> <NA> <NA> 4 i
#V5 <NA> <NA> <NA> <NA> 5 j

Related

Join specific columns of matching rows

I have this data frame:
patientcA 1 2 NA NA b c
patientcB NA NA 3 4 b c
patientdA 3 3 NA NA d e
patientdB NA NA 5 6 d e
How can I join columns 2, 3, 4 and 5 for those rows which match in column 1 except for the last character? In this case, the first two rows match except for the last character, and the last two rows do the same. So my expected output would be:
patientcA 1 2 3 4 b c
patientcB 1 2 3 4 b c
patientdA 3 3 5 6 d e
patientdB 3 3 5 6 d e
I have tried something like this, but I don't know what to write as the else argument. Moreover, I think this is not the best approach:
new_data$first_column<-ifelse(grepl('A$', original_data$first), original_data$first, ?)
Maybe you might consider a tidyverse approach that uses separate to put the last character of column 1 into a new column, and fill to replace NA with values for the same patient.
library(tidyverse)
df %>%
separate(V1, into = c("patient", "letter"), sep = -1) %>%
group_by(patient) %>%
fill(V2:V5, .direction = "downup")
Output
patient letter V2 V3 V4 V5 V6 V7
<chr> <chr> <int> <int> <int> <int> <chr> <chr>
1 patientc A 1 2 3 4 b c
2 patientc B 1 2 3 4 b c
3 patientd A 3 3 5 6 d e
4 patientd B 3 3 5 6 d e
You could write a vectorized function like CC() below, that completes columns, then split-apply-combine with by.
CC <- Vectorize(function(x) if (any(is.na(x))) rep(x[!is.na(x)], length(x)) else x)
res <- do.call(rbind.data.frame, by(dat, substr(dat$V1, 8, 8), CC))
res
# V1 V2 V3 V4 V5 V6 V7
# c.1 patientcA 1 2 3 4 b c
# c.2 patientcB 1 2 3 4 b c
# d.1 patientdA 3 3 5 6 d e
# d.2 patientdB 3 3 5 6 d e

Shift cells in a data frame to the right based on row values

I'm working with a data frame that was created from multiple tab-separated text files imported as tibbles that are joined together using rbind. The files all contain similar column names, but some of the values are under the wrong column when imported due to comments that are added in the creation of these files. I'm attempting to shift the non-comment cell values to their adjacent right column. I cannot alter the original files in any way.
Initial Example Data Frames Code:
df1<-df<-data.frame(
c1=c("A","B","C",1,1,1),
c2=c(1,1,1,5,NA,5),
c3=c(5,5,5,"C","C","C"),
c4=c("C","C","C",NA,NA,NA)
)
df2<-data.frame(
c1=c("A","B","F",2,2,2),
c2=c(2,2,2,6,6,6),
c3=c(6,6,6,"D","D","D"),
c4=c("D","D","D",NA,NA,NA)
)
Initial Example Data Frames:
> df1
c1 c2 c3 c4
1 A 1 5 C
2 B 1 5 C
3 C 1 5 C
4 1 5 C <NA>
5 1 NA C <NA>
6 1 5 C <NA>
> df2
c1 c2 c3 c4
1 A 2 6 D
2 B 2 6 D
3 F 2 6 D
4 2 NA D <NA>
5 2 6 D <NA>
6 2 6 D <NA>
Compiled Data Frame Code:
df<-rbind(df1,df2)
Compiled Data Frame:
> df
c1 c2 c3 c4
1 A 1 5 C
2 B 1 5 C
3 C 1 5 C
4 1 5 C <NA>
5 1 NA C <NA>
6 1 5 C <NA>
7 A 2 6 D
8 B 2 6 D
9 F 2 6 D
10 2 NA D <NA>
11 2 6 D <NA>
12 2 6 D <NA>
Desired Data Frame:
c1 c2 c3 c4
1 A 1 5 C
2 B 1 5 C
3 C 1 5 C
4 <NA> 1 5 C
5 <NA> 1 NA C
6 <NA> 1 5 C
7 A 2 6 D
8 B 2 6 D
9 F 2 6 D
10 <NA> 2 NA D
11 <NA> 2 6 D
12 <NA> 2 6 D
The comments and the repeating lines in the first column that are to be shifted are not always the same length or the same values, nor are they always numeric.
I've tried using an altered version of a similar question to shift my values. Using R to shift values to the left of data.frame
df[]<-t(apply(df, 1, function(x) c(x[is.na(x)], x[!is.na(x)])))
However, this code moves every NA value to the front of its row, and some columns legitimately contain NA values, so it only works when the final column is NA and none of the other columns are.
> df
c1 c2 c3 c4
1 A 1 5 C
2 B 1 5 C
3 C 1 5 C
4 <NA> 1 5 C
5 <NA> <NA> 1 C
6 <NA> 1 5 C
7 A 2 6 D
8 B 2 6 D
9 F 2 6 D
10 <NA> <NA> 2 D
11 <NA> 2 6 D
12 <NA> 2 6 D
Is there a way for this code to only use the final column as a guide to shift the appropriate cells to the right?
Edit: typo on modified code from similar question df1 changed to df.
UPDATED: Here is the solution. It is a bit cumbersome, but it works. To preserve the initial order, it is a good idea to store the row index as a column and sort the final df by it. Later on, you can turn that column back into row names with column_to_rownames, or simply drop it with select(df, -c('index')). With this, I hope I have answered the question.
df <- rbind(df1,df2)
df <- mutate_all(df, as.character)
df <- rownames_to_column(df, 'index')
df_ok <- filter(df, !is.na(c4))
df_na <- filter(df, is.na(c4))
df_fin <- df_na %>%
select(c4, everything()) %>%
rename(c1 = c4, c2 = c1, c3 = c2, c4 = c3) %>%
rbind(df_ok)
df_fin <- df_fin %>%
mutate(index = as.integer(index)) %>%
arrange(index)

data.table merge() with NA in by column

I'm trying to join two tables where the column that is joined on has some NA values such that when the NA is encountered the record is padded with NA's i.e.
Given:
> x = data.table(c(1,2,3,NA,5), c("a","b","c","d","e"))
> x
V1 V2
1: 1 a
2: 2 b
3: 3 c
4: NA d
5: 5 e
> y = data.table(c(NA,2,3,4,5), c("A","B","C","D","E"))
> y
V1 V2
1: NA A
2: 2 B
3: 3 C
4: 4 D
5: 5 E
I want my output to be:
> z = data.table(c(NA,NA,1,2,3,4,5),c("d",NA,"a","b","c",NA,"e"),c(NA,"A",NA,"B","C","D","E"))
> z
V1 V2 V3
1: NA d NA
2: NA NA A
3: 1 a NA
4: 2 b B
5: 3 c C
6: 4 NA D
7: 5 e E
I thought merge() could be used to do this. But I can't get it to produce the output I expect:
> merge(x,y, by=c("V1"), all=TRUE)
V1 V2.x V2.y
1: NA d A
2: 1 a NA
3: 2 b B
4: 3 c C
5: 4 NA D
6: 5 e E
I really don't like that it merges based on the NA value as if it was a match, and when I do this in a larger table with several NA's, it seems to iterate over all possible combinations of column values for V1 and V2 given an NA key. Any help would be appreciated.
The data.frame method of merge has an incomparables argument, which the data.table version of merge doesn't have.
So, using the dataframe method:
merge.data.frame(x, y, by = "V1", all = TRUE, incomparables = NA)
gives the intended result:
V1 V2.x V2.y
1 1 a <NA>
2 2 b B
3 3 c C
4 4 <NA> D
5 5 e E
6 NA d <NA>
7 NA <NA> A
NOTE: According to this GitHub-issue, the data.table developers are planning to include an incomparables-argument in merge.data.table in the future.

how to use R to transfer hourly passenger OD data to od matrix

I'm trying to transfer hourly passenger OD data to OD matrix.
My current dataframe looks something like this:
Hour Ostation Dstation Passengers
8 A B 2
8 A C 3
8 A D 4
8 B C 5
8 B D 6
8 C D 1
10 A B 4
10 A C 5
10 A D 6
10 B C 1
10 B D 2
10 C D 3
And I'd like for HOUR = 8:
A B C D
A
B 2
C 3 5
D 4 6 1
And HOUR = 10:
A B C D
A
B 4
C 5 1
D 6 2 3
I use split and table:
ODdata$Ostation <- factor(ODdata$Ostation)
ODdata$Dstation <- factor(ODdata$Dstation)
ODtable <-lapply(split(ODdata, ODdata$Hour),
function(x) table(x$Ostation, x$Dstation))
I can get the OD matrix, but the values are counts, not Passengers.
You should use dcast from reshape2 in place of table. Set drop = FALSE to keep all factor levels in the output matrices.
library(reshape2)
ODtable <-lapply(split(ODdata, ODdata$Hour),
function(x) dcast(Dstation ~ Ostation , data = x,
value.var = "Passengers", drop = FALSE))
ODtable
#$`8`
# Dstation A B C D
#1 A NA NA NA NA
#2 B 2 NA NA NA
#3 C 3 5 NA NA
#4 D 4 6 1 NA
#
#$`10`
# Dstation A B C D
#1 A NA NA NA NA
#2 B 4 NA NA NA
#3 C 5 1 NA NA
#4 D 6 2 3 NA
sample data
ODdata <- data.frame(Hour = c(rep(8,6), rep(10,6)),
Ostation = factor(c("A","A","A","B","B","C","A","A","A","B","B","C"),
levels = c('A', 'B', 'C', 'D')),
Dstation = factor(c("B","C","D","C","D","D","B","C","D","C","D","D"),
levels = c('A', 'B', 'C', 'D')),
Passengers = c(2,3,4,5,6,1,4,5,6,1,2,3))

Frequencies data table multiple columns

I have a data table like this
require(data.table)
dt <- data.table(a= c("a","a","b","b","b"), b= c("a","a","c","c","e"), c=c("d","d","b","b","b"))
I want to count frequencies from all the columns. I know how to do it one by one, but I want to do it in one instruction because my data has a lot of columns.
Result must be this one:
dt[,a1:=.N, by = c("a")]
dt[,a2:=.N, by = c("b")]
dt[,a3:=.N, by = c("c")]
require(data.table)
dt <- data.table(a= c("a","a","b","b","b"),
b= c("a","a","c","c","e"),
c=c("d","d","b","b","b"))
#dt
# a b c
#1: a a d
#2: a a d
#3: b c b
#4: b c b
#5: b e b
l=lapply(seq_along(colnames(dt)),
function(i) dt[,eval(colnames(dt)[i]),with=F][, x:=.N,by=eval(colnames(dt)[i])])
#l
#[[1]]
# a x
#1: a 2
#2: a 2
#3: b 3
#4: b 3
#5: b 3
#[[2]]
# b x
#1: a 2
#2: a 2
#3: c 2
#4: c 2
#5: e 1
#[[3]]
# c x
#1: d 2
#2: d 2
#3: b 3
#4: b 3
#5: b 3
df = as.data.frame(l)
# replacing alternate column names with concatenating "_count" to it
colnames(df)[seq(2,length(colnames(df)),2)]=
paste0(colnames(df)[seq(1,length(colnames(df)),2)],"_count")
#df
# a a_count b b_count c c_count
#1 a 2 a 2 d 2
#2 a 2 a 2 d 2
#3 b 3 c 2 b 3
#4 b 3 c 2 b 3
#5 b 3 e 1 b 3

Resources