arranging columns based on numeric values in r - r

I need to arrange column names based on numbering.
Here is a short version of my dataset.
# Example data: two identifier columns followed by three score columns whose
# names are numeric-looking labels, deliberately out of order.
# check.names = FALSE keeps the non-syntactic names ("2.2", ...) as written.
df <- data.frame(
  id = c(1, 2, 3),
  raw_score = c(10, 20, 30),
  `2.2` = c(1, 1, 1),
  `2.3` = c(2, 3, 4),
  `2.1` = c(4, 6, 7),
  check.names = FALSE
)
> df
id raw_score 2.2 2.3 2.1
1 1 10 1 2 4
2 2 20 1 3 6
3 3 30 1 4 7
How can I rearrange the columns into the order shown below?
> df
id raw_score 2.1 2.2 2.3
1 1 10 4 1 2
2 2 20 6 1 3
3 3 30 7 1 4

Maybe
# Keep the two identifier columns first, then every remaining column in
# natural numeric order of its name ("2.1" < "2.2" < "2.10").
# names(df)[-(1:2)] is used rather than colnames(df[, 3:ncol(df)]): the latter
# collapses to a plain vector (NULL names) when only one such column exists,
# and 3:ncol(df) counts backwards when there are none.
numbered_cols <- stringr::str_sort(names(df)[-(1:2)], numeric = TRUE)
# all_of() marks numbered_cols as an external character vector of column names.
df <- dplyr::select(df, id, raw_score, dplyr::all_of(numbered_cols))

Related

is it possible to filter rows of one dataframe based on another dataframe?

is it possible to filter rows of one dataframe based on another dataframe?
I have these two data frames:
df_node <- data.frame( id= c("a","b","c","d","e","f","g","h","i"),
group= c(1,1,1,2,2,2,3,3,3))
df_link <- data.frame(from = c("a","d","f","i","b"),
to = c("d","f","i","b","h"))
I would like to delete the rows of df_node whose id does not appear anywhere in the second data frame (df_link).
here is a basic way to do that:
df_node <- data.frame( id= c("a","b","c","d","e","f","g","h","i"),
group= c(1,1,1,2,2,2,3,3,3))
df_link <- data.frame(from = c("a","d","f","i","b"),
to = c("d","f","i","b","h"))
library(dplyr)
df_result <- df_node%>%
filter(id%in%c(df_link$from,df_link$to))
df_result
# > df_result
# id group
# 1 a 1
# 2 b 1
# 3 d 2
# 4 f 2
# 5 h 3
# 6 i 3
We could use a semi_join:
library(dplyr)
df_node |>
semi_join(tibble(id = c(df_link$from, df_link$to)))
Output:
id group
1 a 1
2 b 1
3 d 2
4 f 2
5 h 3
6 i 3
Here is a one-liner with base R:
df_node[df_node$id %in% unlist(df_link),]
id group
1 a 1
2 b 1
4 d 2
6 f 2
8 h 3
9 i 3
But you could also use a join:
library(dplyr)
# Every id that occurs as either endpoint of a link, without repeats.
df_uniqueID <- data.frame(id = unique(c(df_link$from, df_link$to)))
# inner_join() keeps only the nodes that are linked. right_join() would also
# return an NA-filled row for any link id that is missing from df_node.
# Naming the key explicitly avoids the "Joining, by" message.
inner_join(df_node, df_uniqueID, by = "id")
id group
1 a 1
2 b 1
3 d 2
4 f 2
5 h 3
6 i 3

Changing the row id conditional on a different column?

Hi I have a data set like this
df <- cbind(c("id",1,2,3,4,5,6,7,8,9,10,11), c("trial",1,1,1,1,1,1,2,2,2,2,2))
However, I want to change the ids conditional on the trial number - something like this
df1 <- cbind(c("id",1.1,1.2,1.3,1.4,1.5,1.6,2.1,2.2,2.3,2.4,2.5), c("trial",1,1,1,1,1,1,2,2,2,2,2))
I would really appreciate it if someone could help me with this. I am still learning R.
df <- data.frame(id = c(1,2,3,4,5,6,7,8,9,10,11), trial = c(1,1,1,1,1,1,2,2,2,2,2))
library(dplyr)
df |>
  arrange(trial, id) |>
  group_by(trial) |>
  # Position of each row within its trial, in id order.
  mutate(trial_id = row_number()) |>
  ungroup() |>
  # Keep the composite id as text: as.numeric() would turn "1.10" into 1.1 and
  # collide with the first row as soon as a trial has ten or more rows.
  mutate(id = paste0(trial, ".", trial_id))
# OR use: tidyr::unite("id", c(trial, trial_id), sep = ".", remove = FALSE)
# A tibble: 11 × 3
id trial trial_id
<chr> <dbl> <int>
1 1.1 1 1
2 1.2 1 2
3 1.3 1 3
4 1.4 1 4
5 1.5 1 5
6 1.6 1 6
7 2.1 2 1
8 2.2 2 2
9 2.3 2 3
10 2.4 2 4
11 2.5 2 5

Retrieving a column value in R by subsetting

I have this dataframe (df):
df <- data.frame(Data1 = c(1,3),
Data2 = c(3,9),
Data3 = c(7,2),
Data1Status = c(1,4),
Data2Status = c(2,5),
Data3Status = c(3,6),
NumberOfMaxValue = c(3,2))
Data1 Data2 Data3 Data1Status Data2Status Data3Status NumberOfMaxValue
1 3 7 1 2 3 3
3 9 2 4 5 6 2
And I want to get this new column:
Data1 Data2 Data3 Data1Status Data2Status Data3Status NumberOfMaxValue DataMaxStatus
1 3 7 1 2 3 3 3
3 9 2 4 5 6 2 5
I tried something like this:
DataMaxStatus = df[, as.numeric(df$NumberOfMaxValue) + 3], but it didn't work.
EDIT/EXPLANATION:
NumberOfMaxValue is the number of the biggest data (1, 2 or 3)
DataMaxStatus is the status of the greatest number among Data1, Data2 and Data3
We can get the corresponding Status value by creating a matrix of row/column index to subset from Status columns.
# Positions of the status columns (Data1Status, Data2Status, Data3Status).
cols <- grep('Status', names(df))
# Build a two-column (row, column) index matrix so that each row picks the
# status column numbered by its NumberOfMaxValue. seq_len() keeps this valid
# for a zero-row data frame, where 1:nrow(df) would yield c(1, 0).
df$DataMaxStatus <- df[cols][cbind(seq_len(nrow(df)), df$NumberOfMaxValue)]
df
# Data1 Data2 Data3 Data1Status Data2Status Data3Status NumberOfMaxValue DataMaxStatus
#1 1 3 7 1 2 3 3 3
#2 3 9 2 4 5 6 2 5

Manipulating large dataset with dcast

Apologies if this is a repeat question but I could not find the specific answer I am looking for. I have a dataframe with counts of different species caught on a given trip. A simplified example with 5 trips and 4 species is below:
# Example catch data in long format: one row per (trip, species) actually
# observed, with the number of individuals caught.
trip <- c(1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 5, 5)
species <- c("a", "b", "c", "b", "d", "a", "b", "c", "d", "c", "c", "d")
count <- c(5, 7, 3, 1, 8, 10, 1, 4, 3, 1, 2, 10)
# Column names are taken from the variable names.
dat <- data.frame(trip, species, count)
dat
> dat
trip species count
1 1 a 5
2 1 b 7
3 1 c 3
4 2 b 1
5 2 d 8
6 3 a 10
7 3 b 1
8 3 c 4
9 3 d 3
10 4 c 1
11 5 c 2
12 5 d 10
I am only interested in the counts of species b for each trip. So I want to manipulate this data frame so I end up with one that looks like this:
trip2 = c(1,2,3,4,5)
species2 = c("b","b","b","b","b")
count2 = c(7,1,1,0,0)
dat2 = cbind.data.frame(trip2, species2, count2)
dat2
> dat2
trip2 species2 count2
1 1 b 7
2 2 b 1
3 3 b 1
4 4 b 0
5 5 b 0
I want to keep all trips, including trips where species b was not observed. So I can't just subset the data by species b. I know I can cast the data so species are the columns and then just remove the columns for the other species like so:
library(dplyr)
library(reshape2)
test = dcast(dat, trip ~ species, value.var = "count", fun.aggregate = sum)
test
> test
trip a b c d
1 1 5 7 3 0
2 2 0 1 0 8
3 3 10 1 4 3
4 4 0 0 1 0
5 5 0 0 2 10
However, my real dataset has several hundred species caught on thousands of trips, and if I try to cast that many species to columns R chokes. There are way too many columns. Is there a way to specify in dcast that I only want to cast species b? Or is there another way to do this that doesn't require casting the data? Thank you.
Here is a data.table approach which I suspect will be very fast for you:
library(data.table)
# Convert dat to a data.table by reference (no copy).
setDT(dat)
# One row per trip holding the total count of species "b"; trips where it was
# never caught get 0 because sum() of an empty selection is 0.
# Subsetting the count column directly avoids materialising .SD for every
# trip, which matters with thousands of groups.
result <- dat[, .(species = "b", count = sum(count[species == "b"])), by = trip]
result
trip species count
1: 1 b 7
2: 2 b 1
3: 3 b 1
4: 4 b 0
5: 5 b 0
We can use tidyverse
library(dplyr)
library(tidyr)
dat %>%
filter(species == 'b') %>%
group_by(trip, species) %>%
summarise(count = sum(count)) %>%
ungroup %>%
complete(trip = unique(dat$trip), fill = list(species = 'b', count = 0))
# A tibble: 5 x 3
# trip species count
# <dbl> <chr> <dbl>
#1 1 b 7
#2 2 b 1
#3 3 b 1
#4 4 b 0
#5 5 b 0

summarise by group of columns using min and maintaining row number

I have a data frame with 3 columns
df <- data.frame(ID1=c(rep(1,4),rep(2,4)), ID2=rep(1:2,4), value=1:8)
I need to recover the min for each group (ID1, ID2) and the position(row.name) of this min in the original table.
Using group_by and summarise, I have obtained the min but I can't see a way to obtain the position as summarise gets rid of the columns not summarised and not used for group.
df<-data.frame(ID1=c(rep(1,4),rep(2,4)), ID2=rep(1:2,4), value=1:8)
df[['X']] <- paste0(df$ID1,'.',df$ID2)
df <- group_by( df, X )
df <- summarise( df, Objective=min(value) )
Any ideas on how to solve this to get?
X Objective Position
1 1.1 1 1
2 1.2 2 2
3 2.1 5 5
4 2.2 6 6
Thanks in advance
If I understand correctly and since you're already using dplyr, you could do it like this:
library(dplyr); library(tidyr)
unite(df, X, ID1:ID2, sep = ".") %>%
mutate(Position = row_number()) %>%
group_by(X) %>% slice(which.min(value))
#Source: local data frame [4 x 3]
#Groups: X
#
# X value Position
#1 1.1 1 1
#2 1.2 2 2
#3 2.1 5 5
#4 2.2 6 6
Or alternatively (only dplyr) - I'd rather use this one:
mutate(df, Position = row_number()) %>% group_by(ID1, ID2) %>% slice(which.min(value))
#Source: local data frame [4 x 4]
#Groups: ID1, ID2
#
# ID1 ID2 value Position
#1 1 1 1 1
#2 1 2 2 2
#3 2 1 5 5
#4 2 2 6 6
data
df <- data.frame(ID1=rep(1:2, each = 4), ID2=rep(1:2,4), value=1:8)
Here's how I would approach this using data.table (rn would be your row number).
library(data.table)
setDT(df, keep.rownames = TRUE)[, .SD[which.min(value)], list(ID1, ID2)]
# ID1 ID2 rn value
# 1: 1 1 1 1
# 2: 1 2 2 2
# 3: 2 1 5 5
# 4: 2 2 6 6
Another option is ordering and then picking the unique values
unique(setorder(df, value), by = c("ID1", "ID2"))
# ID1 ID2 rn value
# 1: 1 1 1 1
# 2: 1 2 2 2
# 3: 2 1 5 5
# 4: 2 2 6 6
Neither approach requires creating the X column
Or using base R
df <- df[order(df$value), ]
df[!duplicated(df[, 1:2]), ]
# ID1 ID2 value
# 1 1 1 1
# 2 1 2 2
# 5 2 1 5
# 6 2 2 6
data
df <- data.frame(ID1=c(rep(1,4),rep(2,4)), ID2=rep(1:2,4), value=1:8)
Using Aggregate:
Data:
df<-data.frame(ID1=c(rep(1,4),rep(2,4)), ID2=rep(1:2,4), value=1:8)
df[['X']] <- paste0(df$ID1,'.',df$ID2)
df$rn<-row.names(df) #rn is the row number
df<-df[c("X","rn","value")]
#> df
# X rn value
#1 1.1 1 1
#2 1.2 2 2
#3 1.1 3 3
#4 1.2 4 4
#5 2.1 5 5
#6 2.2 6 6
#7 2.1 7 7
#8 2.2 8 8
Aggregate step:
# Keep, for each X group, the single row that holds the smallest value, so rn
# really is the position of that minimum.
# aggregate(df, by = list(df$X), min) would minimise every column on its own
# (and compare rn as text, where "10" < "2"), which only gives the right
# position when values happen to increase with the row number.
df_sorted <- df[order(df$X, df$value), ]
df2 <- df_sorted[!duplicated(df_sorted$X), ]
#> df2
#    X rn value
#1 1.1  1     1
#2 1.2  2     2
#5 2.1  5     5
#6 2.2  6     6

Resources