I am trying to convert a matrix to a dataframe and use a column name and row name in the matrix with variables in the dataframe.
here is the sample
sample = matrix(c(1,NA,NA,2,NA,3,NA,NA,5,NA,NA,6,NA,NA,NA,NA,8,NA,3,1),ncol = 4)
colnames(sample) = letters[1:4]
row.names(sample) = letters[22:26]
My dataset has a lot of NA so I am trying to remove all the NA in the dataframe.
so here is my desiring output,
data.frame(col = c("v","v","w","w","y","y","y","z"),
row = c("a","b","c","c","a","b","d","d"),
value = c(1,3,6,8,2,5,3,1))
Use melt from reshape2 package for reshaping, then clear NA. Finally, do some formating stuff to get your desired output (ordering, setting colnames...).
> library(reshape2)
> df <- na.omit(melt(sample)) # reshaping
> df <- df[order(df$Var1), ] # ordering
> colnames(df) <- c("col", "row", "value") # setting colnames
> df # getting desired output
col row value
1 v a 1
6 v b 3
12 w c 6
17 w d 8
4 y a 2
9 y b 5
19 y d 3
20 z d 1
With dplyr and magrittr
> library(magrittr)
> library(dplyr)
> sample %>% melt %>%
na.omit %>%
arrange(., Var1) %>%
setNames(c('col', 'row', 'value'))
col row value
1 v a 1
2 v b 3
3 w c 6
4 w d 8
5 y a 2
6 y b 5
7 y d 3
8 z d 1
Here is a base R method by replicating the row names and column names
out <- na.omit(data.frame(col = rownames(sample)[row(sample)],
row = colnames(sample)[col(sample)], value = c(sample)))
out <- out[order(out$col),]
row.names(out) <- NULL
out
# col row value
#1 v a 1
#2 v b 3
#3 w c 6
#4 w d 8
#5 y a 2
#6 y b 5
#7 y d 3
#8 z d 1
Related
I would like to match two columns based on another. I'm trying to use the match function but gets NA values.
a <- data.frame( x = c(1,2,3,4,5))
b <- data.frame( y = c(3,4),
z = c("A","B"))
a$x <- b$z[match(a$x, b$y)]
I get:
> a
x
1 <NA>
2 <NA>
3 A
4 B
5 <NA>
I would like :
> a
x
1 1
2 2
3 A
4 B
5 5
First, rename the numeric column of b so that you can merge the two data frames:
b <- b %>% rename(x = y)
Then, merge them, turn variables into character and replace the values of column x with those of z if not NA.
a <- merge(a, b, by = "x", all.x = TRUE) %>%
mutate_all(as.character) %>%
mutate(x = ifelse(is.na(z), x, z))
Result:
x z
1 1 <NA>
2 2 <NA>
3 A A
4 B B
5 5 <NA>
Without renaming I would propose this which ends with the same result that broti
tmp.merge<- merge(a,b,by.x = "x", by.y="y", all = TRUE)
for (elm in as.numeric(row.names(tmp.merge[which(!is.na(tmp.merge$z)),]))){
tmp.merge[elm,'x'] <- as.character(tmp.merge[elm,'z'])
}
tmp.merge
result :
> tmp.merge
x z
1 1 <NA>
2 2 <NA>
3 A A
4 B B
5 5 <NA>
The following works but you need to set stringsAsFactors = F, when defining dataframe b
a <- data.frame( x = c(1,2,3,4,10,13,12,11))
b <- data.frame( y = c(10,12,13),
z = c("A","B","C"),stringsAsFactors = F)
#
a %>% mutate(x = ifelse(x %in% b$y,b$z[match(x,b$y)],x))
Output
x
1 1
2 2
3 3
4 4
5 A
6 C
7 B
8 11
I want to recursively filter a dataframe, d by an arbitrary number of conditions (represented as rows in another dataframe z).
I begin with a dataframe d:
d <- data.frame(x = 1:10, y = letters[1:10])
The second dataframe z, has columns x1 and x2, which are lower and upper limits to filter d$x. This dataframe z may grow to be an arbitrary number of rows long.
z <- data.frame(x1 = c(1,3,8), x2 = c(1,4,10))
I want to return all rows of d for which d$x <= z$x1[i] and d$x >= z$x2[i] for all i, where i = nrow(z).
So for this toy example, exclude everything from 1:1, 3:4, 8:10, inclusive.
x y
2 2 b
5 5 e
6 6 f
7 7 g
We can create a sequence between x1 and x2 values and use anti_join to select rows from d that are not present in z.
library(tidyverse)
remove <- z %>%
mutate(x = map2(x1, x2, seq)) %>%
unnest(x) %>%
select(x)
anti_join(d, remove)
# x y
#1 2 b
#2 5 e
#3 6 f
#4 7 g
We can use a non-equi join
library(data.table)
i1 <- setDT(d)[z, .I, on = .(x >=x1, x <= x2), by = .EACHI]$I
i1
#[1] 1 3 4 8 9 10
d[i1]
# x y
#1: 1 a
#2: 3 c
#3: 4 d
#4: 8 h
#5: 9 i
#6: 10 j
d[!i1]
# x y
#1: 2 b
#2: 5 e
#3: 6 f
#4: 7 g
Or using fuzzyjoin
library(fuzzyjoin)
library(dplyr)
fuzzy_inner_join(d, z, by = c('x' = 'x1', 'x' = 'x2'),
match_fun = list(`>=`, `<=`)) %>%
select(names(d))
# A tibble: 6 x 2
# x y
# <int> <fct>
#1 1 a
#2 3 c
#3 4 d
#4 8 h
#5 9 i
#6 10 j
Or to get the rows not in 'x' from 'd'
fuzzy_anti_join(d, z, by = c('x' = 'x1', 'x' = 'x2'),
match_fun = list(`>=`, `<=`)) %>%
select(names(d))
# A tibble: 4 x 2
# x y
# <int> <fct>
#1 2 b
#2 5 e
#3 6 f
#4 7 g
Here is sample data:
df <- data.frame(t(data.frame(seq(1,10,1)))); rownames(df) <- NULL;
colnames(df) <- letters[1:ncol(df)]
df
I would like to arrange the new data.frame so that it always has 6 columns, the next row (after splinting since ncol>6) would contain the next 6 column names and next row their values. The last row if ncol<6 the values are filled with empty string including the column names.
Here is desired output:
a b c d e f
1 1 2 3 4 5 6
2 g h i j
3 7 8 9 10
Another example:
df <- data.frame(t(data.frame(seq(1,15,1)))); rownames(df) <- NULL;
colnames(df) <- letters[1:ncol(df)]
df
a b c d e f
1 1 2 3 4 5 6
2 g h i j k l
3 7 8 9 10 11 12
4 m n o
5 13 14 15
EDIT:
The way to approach it possibly is to:
n <- 6
ncl <- nrow(df)
s <- split(df, rep(1:ceiling(ncl/n), each=n, length.out=ncl))
s
s1 <- split(rownames(df), rep(1:ceiling(ncl/n), each=n, length.out=ncl))
s1
combine every second split of s and s1
s1[c(TRUE,FALSE)]
Here's a way, not so pretty, but this is an ugly question :D
library(tibble)
library(dplyr)
df1 <- matrix(c(names(df),rep('',6 - ncol(df)%%6)) %>% unlist, ncol=6,byrow=T) %>% as_tibble %>% rowid_to_column()
df2 <- matrix(c(df ,rep('',6 - ncol(df)%%6)) %>% unlist, ncol=6,byrow=T) %>% as_tibble %>% rowid_to_column()
bind_rows(df1,df2) %>% arrange(rowid) %>% select(-1) %>% setNames(.[1,]) %>% slice(-1)
# # A tibble: 3 x 6
# a b c d e f
# <chr> <chr> <chr> <chr> <chr> <chr>
# 1 1 2 3 4 5 6
# 2 g h i j
# 3 7 8 9 10
For the life of me I can't figure out a use-case for this... but for sake of the provided examples...
seq(1, ncol(df), by = 6) %>% {
starts <- .
ends <- c(lead(.,1,NULL)-1, ncol(df))
base_df <- df[,starts[[1]]:ends[[1]]]
rbind(base_df, rbind.pages(Map(function(s, e){
d <- df[,seq(s, e)]
data.frame(rbind(colnames(d), d)) %>% setNames(colnames(base_df)[1:length(.)])
}, s = starts[-1], e = ends[-1]))
) %>%
mutate_all(function(x){
ifelse(!is.na(x), x, "")
})
}
a b c d e f
1 1 2 3 4 5 6
2 g h i j k l
3 7 8 9 10 11 12
4 m n o
5 13 14 15
EDIT to coerce NA to 'empty string'
Here is my issue:
df1 <- data.frame(x = 1:5, y = 2:6, z = 3:7)
rownames(df1) <- LETTERS[1:5]
df1
x y z
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 6
E 5 6 7
df2 <- data.frame(x = 1:5, y = 2:6, z = 3:7)
rownames(df2) <- LETTERS[3:7]
df2
x y z
C 1 2 3
D 2 3 4
E 3 4 5
F 4 5 6
G 5 6 7
what I wanted is:
x y z
A 1 2 3
B 2 3 4
C 4 6 8
D 6 8 10
E 8 10 12
F 4 5 6
G 5 6 7
where duplicated rows were added up by same variable.
A solution with base R:
# create a new variable from the rownames
df1$rn <- rownames(df1)
df2$rn <- rownames(df2)
# bind the two dataframes together by row and aggregate
res <- aggregate(cbind(x,y,z) ~ rn, rbind(df1,df2), sum)
# or (thx to #alistaire for reminding me):
res <- aggregate(. ~ rn, rbind(df1,df2), sum)
# assign the rownames again
rownames(res) <- res$rn
# get rid of the 'rn' column
res <- res[, -1]
which gives:
> res
x y z
A 1 2 3
B 2 3 4
C 4 6 8
D 6 8 10
E 8 10 12
F 4 5 6
G 5 6 7
With dplyr,
library(dplyr)
# add rownames as a column in each data.frame and bind rows
bind_rows(df1 %>% add_rownames(),
df2 %>% add_rownames()) %>%
# evaluate following calls for each value in the rowname column
group_by(rowname) %>%
# add all non-grouping variables
summarise_all(sum)
## # A tibble: 7 x 4
## rowname x y z
## <chr> <int> <int> <int>
## 1 A 1 2 3
## 2 B 2 3 4
## 3 C 4 6 8
## 4 D 6 8 10
## 5 E 8 10 12
## 6 F 4 5 6
## 7 G 5 6 7
could also vectorize the operation turning the dfs to matrices:
result_df <- as.data.frame(as.matrix(df1) + as.matrix(df2))
This might need some teaking to get the rownames logic working on a longer example:
dfr <-rbind(df1,df2)
do.call(rbind, lapply( split(dfr, sapply(rownames(dfr),substr,1,1)), colSums))
x y z
A 1 2 3
B 2 3 4
C 4 6 8
D 6 8 10
E 8 10 12
F 4 5 6
G 5 6 7
If the rownames could all be assumed to be alpha characters a gsub solution should be easy.
An alternative is to melt the data and cast it. At first we set the row names to the last column of both data frames thanks to #Jaap
df1$rn <- rownames(df1)
df2$rn <- rownames(df2)
Then we melt the data based on the name
melt(list(df1, df2), id.vars = "rn")
Then we use dcast with mget function which is used to retrieve multiple variables at once.
mydf<- dcast(melt(mget(ls(pattern = "df\\d+")), id.vars = "rn"),
rn ~ variable, value.var = "value", fun.aggregate = sum)
rownames(mydf) <- mydf$rn
# get rid of the 'rn' column
mydf <- mydf[, -1]
> mydf
# x y z
#A 1 2 3
#B 2 3 4
#C 4 6 8
#D 6 8 10
#E 8 10 12
#F 4 5 6
#G 5 6 7
There are many answers for how to split a dataframe, for example How to split a data frame?
However, I'd like to split a dataframe so that the smaller dataframes contain the last row of the previous dataframe and the first row of the following dataframe.
Here's an example
n <- 1:9
group <- rep(c("a","b","c"), each = 3)
data.frame(n = n, group)
n group
1 1 a
2 2 a
3 3 a
4 4 b
5 5 b
6 6 b
7 7 c
8 8 c
9 9 c
I'd like the output to look like:
d1 <- data.frame(n = 1:4, group = c(rep("a",3),"b"))
d2 <- data.frame(n = 3:7, group = c("a",rep("b",3),"c"))
d3 <- data.frame(n = 6:9, group = c("b",rep("c",3)))
d <- list(d1, d2, d3)
d
[[1]]
n group
1 1 a
2 2 a
3 3 a
4 4 b
[[2]]
n group
1 3 a
2 4 b
3 5 b
4 6 b
5 7 c
[[3]]
n group
1 6 b
2 7 c
3 8 c
4 9 c
What is an efficient way to accomplish this task?
Suppose DF is the original data.frame, the one with columns n and group. Let n be the number of rows in DF. Now define a function extract which given a sequence of indexes ix enlarges it to include the one prior to the first and after the last and then returns those rows of DF. Now that we have defined extract, split the vector 1, ..., n by group and apply extract to each component of the split.
n <- nrow(DF)
extract <- function(ix) DF[seq(max(1, min(ix) - 1), min(n, max(ix) + 1)), ]
lapply(split(seq_len(n), DF$group), extract)
$a
n group
1 1 a
2 2 a
3 3 a
4 4 b
$b
n group
3 3 a
4 4 b
5 5 b
6 6 b
7 7 c
$c
n group
6 6 b
7 7 c
8 8 c
9 9 c
Or why not try good'ol by, which "[a]ppl[ies] a Function to a Data Frame Split by Factors [INDICES]".
by(data = df, INDICES = df$group, function(x){
id <- c(min(x$n) - 1, x$n, max(x$n) + 1)
na.omit(df[id, ])
})
# df$group: a
# n group
# 1 1 a
# 2 2 a
# 3 3 a
# 4 4 b
# --------------------------------------------------------------------------------
# df$group: b
# n group
# 3 3 a
# 4 4 b
# 5 5 b
# 6 6 b
# 7 7 c
# --------------------------------------------------------------------------------
# df$group: c
# n group
# 6 6 b
# 7 7 c
# 8 8 c
# 9 9 c
Although the print method of by creates a 'fancy' output, the (default) result is a list, with elements named by the levels of the grouping variable (just try str and names on the resulting object).
I was going to comment under #cdetermans answer but its too late now.
You can generalize his approach using data.table::shift (or dyplr::lag) in order to find the group indices and then run a simple lapply on the ranges, something like
library(data.table) # v1.9.6+
indx <- setDT(df)[, which(group != shift(group, fill = TRUE))]
lapply(Map(`:`, c(1L, indx - 1L), c(indx, nrow(df))), function(x) df[x,])
# [[1]]
# n group
# 1: 1 a
# 2: 2 a
# 3: 3 a
# 4: 4 b
#
# [[2]]
# n group
# 1: 3 a
# 2: 4 b
# 3: 5 b
# 4: 6 b
# 5: 7 c
#
# [[3]]
# n group
# 1: 6 b
# 2: 7 c
# 3: 8 c
# 4: 9 c
Could be done with data.frame as well, but is there ever a reason not to use data.table? Also this has the option to be executed with parallelism.
library(data.table)
n <- 1:9
group <- rep(c("a","b","c"), each = 3)
df <- data.table(n = n, group)
df[, `:=` (group = factor(df$group))]
df[, `:=` (group_i = seq_len(.N), group_N = .N), by = "group"]
library(doParallel)
groups <- unique(df$group)
foreach(i = seq(groups)) %do% {
df[group == groups[i] | (as.integer(group) == i + 1 & group_i == 1) | (as.integer(group) == i - 1 & group_i == group_N), c("n", "group"), with = FALSE]
}
[[1]]
n group
1: 1 a
2: 2 a
3: 3 a
4: 4 b
[[2]]
n group
1: 3 a
2: 4 b
3: 5 b
4: 6 b
5: 7 c
[[3]]
n group
1: 6 b
2: 7 c
3: 8 c
4: 9 c
Here is another dplyr way:
library(dplyr)
data =
data_frame(n = n, group) %>%
group_by(group)
firsts =
data %>%
slice(1) %>%
ungroup %>%
mutate(new_group = lag(group)) %>%
slice(-1)
lasts =
data %>%
slice(n()) %>%
ungroup %>%
mutate(new_group = lead(group)) %>%
slice(-n())
bind_rows(firsts, data, lasts) %>%
mutate(final_group =
ifelse(is.na(new_group),
group,
new_group) ) %>%
arrange(final_group, n) %>%
group_by(final_group)