I have the following matrix:
mat <- matrix(0, ncol = 10, nrow = 5)
colnames(mat) <- c("A1","A2","A3","A4","A5","A6","A7","A8","A9","A10")
rownames(mat) <- c("ID_1", "ID_2", "ID_3", "ID_4", "ID_5")
mat[1,] <- c(0,0,1,1,1,1,0,0,0,0)
mat[2,] <- c(0,0,0,1,1,1,0,0,0,0)
mat[3,] <- c(0,0,0,1,1,1,1,1,1,0)
mat[4,] <- c(0,0,0,0,0,1,1,1,1,0)
mat[5,] <- c(0,0,0,0,0,0,1,1,1,1)
I want to convert this matrix into a table with three columns: "ID", "start" and "stop", where "start" holds the name of the column containing the first 1 in each row and "stop" holds the name of the column containing the last 1 in that row.
Could you please help me?
Thanks in advance.
Here is one way using dplyr: convert the matrix to a data frame, move the row names into a column, reshape the data to long format, filter the rows where value == 1, and take the first and last column name for each id.
library(dplyr)
mat %>%
  as.data.frame() %>%
  tibble::rownames_to_column('id') %>%
  tidyr::pivot_longer(cols = -id) %>%
  filter(value == 1) %>%
  group_by(id) %>%
  summarise(start = first(name), stop = last(name))
# A tibble: 5 x 3
# id start stop
# <chr> <chr> <chr>
#1 ID_1 A3 A6
#2 ID_2 A4 A6
#3 ID_3 A4 A9
#4 ID_4 A6 A9
#5 ID_5 A7 A10
In base R, keeping mat as a matrix:
t(apply(mat, 1, function(x) {
inds <- which(x == 1)
c(start = colnames(mat)[min(inds)], stop = colnames(mat)[max(inds)])
}))
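The result above is a character matrix with the IDs as row names. If a data frame with an explicit "ID" column is preferred, as in the requested output, here is one possible wrapper of the same idea (assuming the row names should become that column):
res <- t(apply(mat, 1, function(x) {
  inds <- which(x == 1)
  c(start = colnames(mat)[min(inds)], stop = colnames(mat)[max(inds)])
}))
# turn the row names into a proper ID column
data.frame(ID = rownames(res), res, row.names = NULL)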
You can do this using the ties.method argument in max.col. Use the result to subset the colnames.
data.frame(id = rownames(mat),
start = colnames(mat)[max.col(mat, "first")],
stop = colnames(mat)[max.col(mat, "last")])
# id start stop
# 1 ID_1 A3 A6
# 2 ID_2 A4 A6
# 3 ID_3 A4 A9
# 4 ID_4 A6 A9
# 5 ID_5 A7 A10
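As a minimal illustration of how ties.method behaves (on a small hypothetical one-row matrix): when the maximum is tied, "first" returns the index of the first maximum and "last" the index of the last one, which for a 0/1 row picks out the first and last 1.
m <- rbind(c(0, 1, 1, 0))
max.col(m, ties.method = "first")  # 2
max.col(m, ties.method = "last")   # 3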
Hello guys, I hope you are doing great. I have the following dataset:
A1          A2          A3          A4         A5         A6         A7
-1.2650612  -0.6868529  -0.4456620  1.2240818  0.3598138  0.4007715  0.1106827
colnames <- paste0("A", 1:7)
set.seed(123)
values <- rnorm(7)
I want to write code that finds, row-wise, the highest value within the first 3 columns (A1, A2, A3) and returns the corresponding column name, then does the same for the next 2 columns (A4, A5) and for the remaining 2 columns (A6, A7). If all values within a group are equal (it could happen), just return the name of the first column in that group.
My desired output will be:
A1,A2,A3   A4,A5   A6,A7
A3         A4      A6
Thank you guys so much! I'll gladly accept the answer from whoever can help me.
Here is one option
library(dplyr)
library(tibble)
library(tidyr)
pivot_longer(df1, everything()) %>%
  group_by(group = rep(1:3, c(3, 2, 2))) %>%
  summarise(name1 = name[which.max(value)],
            name2 = toString(name)) %>%
  select(name2, name1) %>%
  deframe() %>%
  as_tibble_row()
Output:
# A tibble: 1 × 3
`A1, A2, A3` `A4, A5` `A6, A7`
<chr> <chr> <chr>
1 A3 A4 A6
Data:
df1 <- structure(list(A1 = -1.2650612, A2 = -0.6868529, A3 = -0.445662,
                      A4 = 1.2240818, A5 = 0.3598138, A6 = 0.4007715,
                      A7 = 0.1106827),
                 class = "data.frame", row.names = c(NA, -1L))
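For comparison, a base R sketch of the same idea, using the same column grouping rep(1:3, c(3, 2, 2)) as above:
grp  <- rep(1:3, c(3, 2, 2))   # groups: A1-A3, A4-A5, A6-A7
vals <- unlist(df1[1, ])       # the single row as a named vector
sapply(split(names(vals), grp), function(nm) nm[which.max(vals[nm])])
#    1    2    3
# "A3" "A4" "A6"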
I have a single column dataframe with all possible IDs:
ID
a1
a2
b1
b11
c1
I get a dataframe from my database with the same column "ID". However, that dataframe may not contain all of the IDs. Here is an example of that table:
ID value
a1 18
a2 10
b1 10
I want to combine these two tables so that the IDs that were not in my database table get the value zero. So, how do I join these two tables to get this:
ID value
a1 18
a2 10
b1 10
b11 0
c1 0
Join the two tables and replace the NA values with 0.
Using dplyr :
library(dplyr)
df1 %>%
  full_join(df2, by = 'ID') %>%
  mutate(value = replace(value, is.na(value), 0))
# ID value
#1 a1 18
#2 a2 10
#3 b1 10
#4 b11 0
#5 c1 0
In base R, you can do this as:
transform(merge(df1, df2, by = 'ID', all = TRUE),
value = replace(value, is.na(value), 0))
We can also do
library(dplyr)
df1 %>%
  full_join(df2, by = 'ID') %>%
  mutate(value = case_when(is.na(value) ~ 0, TRUE ~ value))
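As a small variation, the NA fill can also be written with dplyr::coalesce() (or tidyr::replace_na()); a minimal sketch of that version:
library(dplyr)
df1 %>%
  full_join(df2, by = 'ID') %>%
  mutate(value = coalesce(value, 0))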
I have a dataframe and an external vector. I need to count the repeated elements in the vector (using table(), I guess). If those values are present in the dataframe, I need to add the count as an additional column. Here is an example:
set.seed(5)
df1 <- data.frame(numb = runif(5),
                  ID = c("a1", "a2", "a3", "a4", "a5"))
numb ID
1 0.2002145 a1
2 0.6852186 a2
3 0.9168758 a3
4 0.2843995 a4
5 0.1046501 a5
# get external vector
vect1 = c("a1", "a1", "a5", "a1")
# count repetitive elements in a vector
my.tab <- table(vect1)
vect1
a1 a5
3 1
I know that I can access the elements of the table object:
# get vector of table names
names(my.tab)
[1] "a1" "a5"
# get number of repetition
as.vector(my.tab)
[1] 3 1
But how do I add those values to my data.frame by matching the row values?
I thought that something like this could work:
df1$repID <- ifelse(df1$ID %in% vect1,
                    <add the count from the table>,  # if yes
                    1)                               # if no: 1 (or any other number)
Expected output:
numb ID repID
1 0.4089769 a1 3
2 0.8830174 a2 1
3 0.9404673 a3 1
4 0.0455565 a4 1
5 0.5281055 a5 1
We can use stack to convert the named vector returned by table into a dataframe, merge it with df1, and replace the NA values with 1.
transform(merge(df1, stack(table(vect1)), by.x = "ID", by.y = "ind", all.x = TRUE),
values = replace(values, is.na(values), 1))
# ID numb values
#1 a1 0.2002145 3
#2 a2 0.6852186 1
#3 a3 0.9168758 1
#4 a4 0.2843995 1
#5 a5 0.1046501 1
The same logic can be implemented in the tidyverse as:
library(tidyverse)
left_join(df1, table(vect1) %>% enframe(), by = c('ID' = 'name')) %>%
  mutate(value = replace_na(value, 1))
The tidyverse solution in full:
library(dplyr)
library(tidyr)
# the initial dataframe
set.seed(5)
df1 <- data.frame(numb = runif(5),
                  ID = c("a1", "a2", "a3", "a4", "a5"),
                  stringsAsFactors = FALSE)
# get external vector
vect1 <- c("a1", "a1", "a5", "a1")
# put this in a dataframe
df2 <- data.frame(ID = vect1, stringsAsFactors = FALSE)
df2 <- df2 %>%
  group_by(ID) %>%        # group the dataframe
  summarise(repID = n())  # and then summarise over the groups
# and finally...
df1 %>% # take the original data frame
left_join(df2, by = "ID") %>% # left join the aggregated data frame
mutate(repID = replace_na(repID, 1)) # and then remove the NAs by 1s
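The grouping step could also be written with dplyr::count(), which groups and counts in one call; a sketch of that variant:
df2 <- data.frame(ID = vect1, stringsAsFactors = FALSE) %>%
  count(ID) %>%        # one row per ID with a count column n
  rename(repID = n)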
You could use data.table.
The basic strategy is to create two data.tables. Use grouping to count the occurrences of the values in the external vector, then merge this grouped count table with the other one using a full join. This gives us NA where there are no shared values in the relevant columns; we backfill the NA values with 1.
> library(data.table)
> my_count <- as.data.table(vect1)[, .(repID = .N), by = vect1] # Using .N to count.
> dt <- setDT(df1)
> data <- merge(dt, my_count, by.x = "ID", by.y = "vect1", all = TRUE) # Merge option all = TRUE is for a "full join".
> setnafill(data, cols = "repID", fill = 1)
Yields:
> data
ID numb repID
1: a1 0.2002145 3
2: a2 0.6852186 1
3: a3 0.9168758 1
4: a4 0.2843995 1
5: a5 0.1046501 1
If you want your final data as a data.frame, use setDF.
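For example (setDF() converts by reference, so no reassignment is needed):
> setDF(data)
> class(data)
[1] "data.frame"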
Hi I have a messy data frame as follows:
df <- data.frame(age.band = c("0-5","5-10"), beg.code = c("A1","B1"), end.code=c("A5","B3"),value = c(10,5))
age.band beg.code end.code value
0-5 A1 A5 10
5-10 B1 B3 5
I would like to transform it into a friendlier format such as:
index age.band value
A1 0-5 10
A2 0-5 10
A3 0-5 10
A4 0-5 10
A5 0-5 10
B1 5-10 5
B2 5-10 5
B3 5-10 5
Can anyone help me to find a way to add all the missing indexes for this dataframe? Thanks
A solution using dplyr and tidyr. Notice that I added stringsAsFactors = FALSE to avoid creating factor columns when building your example data frame. If you run the code on your original data frame, you will receive a warning message due to the factor columns, but it will not affect the end result.
library(dplyr)
library(tidyr)
df2 <- df %>%
  gather(Code, Value, ends_with("code")) %>%
  extract(Value, into = c("Group", "Index"), regex = "([A-Za-z+].*)([\\d].*$)",
          convert = TRUE) %>%
  select(-Code) %>%
  group_by(Group) %>%
  complete(Index = full_seq(Index, period = 1)) %>%
  unite(Index, c("Group", "Index"), sep = "") %>%
  fill(-Index)
df2
# # A tibble: 8 x 3
# Index age.band value
# * <chr> <chr> <dbl>
# 1 A1 0-5 10
# 2 A2 0-5 10
# 3 A3 0-5 10
# 4 A4 0-5 10
# 5 A5 0-5 10
# 6 B1 5-10 5
# 7 B2 5-10 5
# 8 B3 5-10 5
DATA
df <- data.frame(age.band = c("0-5","5-10"), beg.code = c("A1","B1"), end.code=c("A5","B3"),value = c(10,5),
stringsAsFactors = FALSE)
Here is one option with base R. The idea is to remove the non-numeric characters from the 'code' columns, convert them to numeric, and store the resulting sequences in a list. Then paste the non-numeric prefix back on and, finally, based on the lengths of that list, expand the rows of the original dataset with rep and create a new column 'index' by unlisting the list.
# numeric parts of beg.code/end.code turned into sequences, e.g. 1:5 and 1:3
lst <- do.call(Map, c(f = `:`, lapply(df[2:3], function(x) as.numeric(sub("\\D+", "", x)))))
# paste the letter prefix back on, e.g. "A1".."A5" and "B1".."B3"
lst1 <- Map(paste0, substr(df[, 2], 1, 1), lst)
# expand the original rows to match the sequence lengths and add the 'index' column
data.frame(index = unlist(lst1), df[rep(seq_len(nrow(df)), lengths(lst1)), -(2:3)])
I have a dataframe (df) with three columns like so:
Structure:
id id1 age
A1 a1 32
A1 a2 45
A1 a3 45
A1 a4 12
A2 b1 15
A2 b5 34
A2 b64 17
Expected Output:
id count count1
A1 4 1
A2 3 2
Logic:
Column "count" is the number of times "id" is repeated
Column "count1" is the number of rows where age is less than 21
Current Code:
library(dplyr)
df_summarized <- df %>%
  group_by(id) %>%
  summarise(count = n(), count1 = count(age < 21))
Problem:
Error: no applicable method for 'group_by_' applied to an object of class "logical"
We need to use sum instead:
df %>%
  group_by(id) %>%
  summarise(count = n(), count1 = sum(age < 21))
# A tibble: 2 × 3
# id count count1
# <chr> <int> <int>
#1 A1 4 1
#2 A2 3 2
since count() applies to a data.frame or tbl_df, not to a single column inside summarise().
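To illustrate the point (a small aside, using the df from the question): count() expects a data frame as its first argument, so handing it the bare logical vector age < 21 inside summarise() is exactly what triggers the error above.
count(df, age < 21)   # fine: df is a data frame; counts rows by TRUE/FALSE
# count(age < 21)     # not fine: a bare logical vector, as in the question's summarise() call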
Or using data.table
library(data.table)
setDT(df)[, .(count = .N, count1 = sum(age < 21)), id]
Or with base R
cbind(count = rowSums(table(df[-2])), count1 = as.vector(rowsum(+(df$age < 21), df$id)))
# count count1
#A1 4 1
#A2 3 2
Or using aggregate based on the sum
do.call(data.frame, aggregate(age~id, df, FUN =
function(x) c(count = length(x), count1 = sum(x<21))))
NOTE: All the above methods return the dataset with proper columns. This is especially relevant for aggregate, which stores its output as a matrix column; that is the reason the output is converted to proper columns with do.call(data.frame, ...).
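To see what the NOTE means, compare the structure of the aggregate result before and after the do.call(data.frame, ...) step:
res <- aggregate(age ~ id, df, FUN = function(x) c(count = length(x), count1 = sum(x < 21)))
str(res)                       # 'age' is a matrix column with sub-columns count and count1
str(do.call(data.frame, res))  # flattened into ordinary columns age.count and age.count1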
With base R, we can use aggregate to find the number of rows for each group (id) as well as the number of rows with age less than 21:
aggregate(age ~ id, df, function(x) c(count = length(x),
                                      count1 = length(x[x < 21])))
# id age.count age.count1
#1 A1 4 1
#2 A2 3 2