Find first value in table in R - r

I have three columns: MPH, Threshold, and Car. I'd like to write some code to return, for each Car, the MPH at the first row where Threshold is 1.
MPH Threshold Car
30 0 A
31 0 A
32 1 A
33 1 A
34 1 A
35 1 A
30 0 B
31 0 B
32 0 B
33 0 B
34 1 B
35 1 B
Desired Output:
Value Car
32 A
34 B

Assuming you'll always have at least one value where Threshold = 1 for each Car, we can do
library(dplyr)
df %>%
group_by(Car) %>%
slice(which.max(Threshold == 1)) %>%
select(-Threshold)
# MPH Car
# <int> <fct>
#1 32 A
#2 34 B
Or using base R ave
df[with(df, ave(Threshold == 1, Car, FUN = function(x)
seq_along(x) == which.max(x))), ]

You can also do
library(dplyr)
df %>%
filter(Threshold == 1) %>%
subset(!duplicated(Car))
library(data.table)
dt <- data.table(df)
dt[Threshold == 1, ][!duplicated(Car),]

An option with data.table
library(data.table)
i1 <- setDT(df)[, .I[which(Threshold == 1)[1]], Car]$V1
df[i1, .(Value = MPH, Car)]
# Value Car
#1: 32 A
#2: 34 B
data
df <- structure(list(MPH = c(30L, 31L, 32L, 33L, 34L, 35L, 30L, 31L,
32L, 33L, 34L, 35L), Threshold = c(0L, 0L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 1L, 1L), Car = c("A", "A", "A", "A", "A", "A", "B",
"B", "B", "B", "B", "B")), class = "data.frame", row.names = c(NA,
-12L))

Related

How to group contiguous variable into a range r

I have an example dataset:
Road Start End Cat
1 0 50 a
1 50 60 b
1 60 90 b
1 70 75 a
2 0 20 a
2 20 25 a
2 25 40 b
Trying to output following:
Road Start End Cat
1 0 50 a
1 50 90 b
1 70 75 a
2 0 25 a
2 25 40 b
My code doesn't work:
df %>% group_by(Road, cat)
%>% summarise(
min(Start),
max(End)
)
How can I achieve the results I wanted?
We can use rleid from data.table to get the run-length-id-encoding for grouping and then do the summarise
library(dplyr)
library(data.table)
df %>%
group_by(Road, grp = rleid(Cat)) %>%
summarise(Cat = first(Cat), Start = min(Start), End = max(End)) %>%
select(-grp)
# A tibble: 5 x 4
# Groups: Road [2]
# Road Cat Start End
# <int> <chr> <int> <int>
#1 1 a 0 50
#2 1 b 50 90
#3 1 a 70 75
#4 2 a 0 25
#5 2 b 25 40
Or using data.table methods
library(data.table)
setDT(df)[, .(Start = min(Start), End = max(End)), .(Road, Cat, grp = rleid(Cat))]
data
df <- structure(list(Road = c(1L, 1L, 1L, 1L, 2L, 2L, 2L), Start = c(0L,
50L, 60L, 70L, 0L, 20L, 25L), End = c(50L, 60L, 90L, 75L, 20L,
25L, 40L), Cat = c("a", "b", "b", "a", "a", "a", "b")),
class = "data.frame", row.names = c(NA,
-7L))

Summarise a group value into single row

I have a large dataset with longitudinal readings from single individuals.
I want to summarise information over time into a binary variable. i.e. if diff in the input table below is >5 for any value I want to then reduce the observation for A to a new column saying TRUE.
#Input
individual val1 val2 diff
A 32 36 -4
A 36 28 8
A 28 26 2
A 26 26 0
B 65 64 1
B 58 59 -1
B 57 54 3
B 54 51 3
#Output
individual newval
A TRUE
B FALSE
Using dplyr you can:
library(dplyr)
df %>%
group_by(individual) %>% # first group data
summarize(newval = any(diff > 5)) # then evaluate test for each group
#> # A tibble: 2 x 2
#> individual newval
#> <fct> <lgl>
#> 1 A TRUE
#> 2 B FALSE
data
df <- read.table(text = "individual val1 val2 diff
A 32 36 -4
A 36 28 8
A 28 26 2
A 26 26 0
B 65 64 1
B 58 59 -1
B 57 54 3
B 54 51 3
", header = TRUE)
Multiple ways to do this :
In base R we can use aggregate
aggregate(diff~individual, df,function(x) any(x>5))
# individual diff
#1 A TRUE
#2 B FALSE
Or tapply
tapply(df$diff > 5, df$individual, any)
We can also use data.table
library(data.table)
# Convert df to a data.table by reference, then compute one logical value per
# individual: TRUE if any diff in that group exceeds 5.
# `.()` (data.table's alias for list()) is required so the result column is
# named `newval`; plain parentheses would yield an auto-named column (V1).
setDT(df)[, .(newval = any(diff > 5)), by = individual]
An option in base R with rowsum
rowsum(+(df1$diff > 5), df1$individual) != 0
or with by
by(df1$diff > 5, df1$individual, any)
data
df1 <- structure(list(individual = c("A", "A", "A", "A", "B", "B", "B",
"B"), val1 = c(32L, 36L, 28L, 26L, 65L, 58L, 57L, 54L), val2 = c(36L,
28L, 26L, 26L, 64L, 59L, 54L, 51L), diff = c(-4L, 8L, 2L, 0L,
1L, -1L, 3L, 3L)), class = "data.frame", row.names = c(NA, -8L
))

Loop to create bivariate/cross table

I am trying to create a loop where I want to get the frequency between column 1 and column 2, column 1 and column 3, ... up to col1 and col30.
Col1 col2 col3
0 A 25
1 A 30
0 A 30
1 B 20
0 B 20
Output.
0 1 0 1
A 2 1 25 0 0
B 1 1 30 1 1
20 1 1
Use lapply to loop over columns and then table to calculate frequency
lapply(df[-1], function(x) table(x, df[, 1]))
#$col2
#x 0 1
# A 2 1
# B 1 1
#$col3
#x 0 1
# 20 1 1
# 25 1 0
# 30 1 1
Or a shorter version using Map
Map(table, df[1], df[-1])
data
df <- structure(list(Col1 = c(0L, 1L, 0L, 1L, 0L), col2 = structure(c(1L,
1L, 1L, 2L, 2L), .Label = c("A", "B"), class = "factor"), col3 = c(25L,
30L, 30L, 20L, 20L)), class = "data.frame", row.names = c(NA, -5L))
We can use tidyverse
library(tidyverse)
map(names(df)[-1], ~ cbind(df[1], df[.x]) %>%
count(Col1, !! rlang::sym(.x)) %>%
spread(Col1, n, fill = 0))
data
df <- structure(list(Col1 = c(0L, 1L, 0L, 1L, 0L), col2 = structure(c(1L,
1L, 1L, 2L, 2L), .Label = c("A", "B"), class = "factor"), col3 = c(25L,
30L, 30L, 20L, 20L)), class = "data.frame", row.names = c(NA, -5L))

R filling zeros with mean value

I have this table right now that looks like this:
Worker | Score
A | 10
A | 20
A | 0
A | 0
A | 0
B | 2
B | 4
B | 0
B | 6
Some of my scores are unavailable, and I've filled those with 0. Is there a way in R to replace those 0 values with the mean of that particular worker's other scores? The final table should look like this:
Worker | Score
A | 10
A | 20
A | 15 (mean of other scores)
A | 15 (mean of other scores)
A | 15 (mean of other scores)
B | 2
B | 4
B | 4 (mean of other scores)
B | 6
Right now I am thinking of looping through, but I have hundreds of thousands of entries, which would make it very slow and inefficient.
Use ave to find the averages for each Worker and then use replace to substitute the relevant values
replace(x = df$Score, list = df$Score == 0, values =
ave(df$Score, df$Worker, FUN = function(x) sum(x, na.rm = TRUE)/sum(x!=0))[df$Score == 0])
#[1] 10 20 15 15 15 2 4 4 6
DATA
df = structure(list(Worker = c("A", "A", "A", "A", "A", "B", "B",
"B", "B"), Score = c(10L, 20L, 0L, 0L, 0L, 2L, 4L, 0L, 6L)), .Names = c("Worker",
"Score"), class = "data.frame", row.names = c(NA, -9L))
One option is na.aggregate from the zoo package. Replace the 0 values in 'Score' with NA, then, grouped by 'Worker', apply na.aggregate on 'Score' to replace each NA with the mean of that group's 'Score', assigning the result to 'Score'.
library(data.table)
library(zoo)
setDT(df1)[Score ==0, Score := NA ][, .(Score = na.aggregate(Score)), by = Worker]
# Worker Score
#1: A 10
#2: A 20
#3: A 15
#4: A 15
#5: A 15
#6: B 2
#7: B 4
#8: B 4
#9: B 6
Or it can be made more compact by
# `!Score` is TRUE where Score is 0. NA^TRUE is NA and NA^FALSE is 1, so the
# product turns zeros into NA and leaves non-zero scores unchanged;
# na.aggregate then fills each NA within the Worker group (default: the mean).
setDT(df1)[, .(Score = na.aggregate(Score*NA^!Score)), Worker]
data
df1 <- structure(list(Worker = c("A", "A", "A", "A", "A", "B", "B",
"B", "B"), Score = c(10L, 20L, 0L, 0L, 0L, 2L, 4L, 0L, 6L)),
.Names = c("Worker",
"Score"), class = "data.frame", row.names = c(NA, -9L))
Here is another solution with data.table
library("data.table")
df1 <- data.table(Worker = c("A", "A", "A", "A", "A", "B", "B", "B", "B"),
Score = c(10L, 20L, 0L, 0L, 0L, 2L, 4L, 0L, 6L))
m <- df1[Score!=0, mean(Score), Worker]
m[df1, on="Worker"][, `:=`(Score=ifelse(Score==0, V1, Score), V1=NULL)][]

R Programming: Calculate two-sample t-test for a data frame with formatting

Good evening,
I have the following data frame:
Sex A B C D E
M 1 20 45 42 12
F 2 10 32 23 43
M 39 32 2 23 43
M 24 43 2 44 12
F 11 3 4 4 11
How would I calculate the two-sample t-test for each numerical variable for the data frame listed above by the sex variable by using the apply function. The result should be a matrix that contains five
columns: F.mean (mean of the numerical variable for Female), M.mean (mean of the numerical variable
for Male), t (for t-statistics), df (for degrees of freedom), and p (for p-value).
Thank you!!
Here is an option using apply with margin 2
out = apply(data[,-1], 2, function(x){
unlist(t.test(x[data$Sex == 'M'], x[data$Sex == 'F'])[c(1:3,5)],
recursive=FALSE)
})
#> out
# A B C D E
#statistic.t 1.2432059 3.35224633 -0.08318328 1.9649783 -0.2450115
#parameter.df 2.5766151 2.82875770 2.70763487 1.9931486 1.8474695
#p.value 0.3149294 0.04797862 0.93946696 0.1887914 0.8309453
#estimate.mean of x 21.3333333 31.66666667 16.33333333 36.3333333 22.3333333
#estimate.mean of y 6.5000000 6.50000000 18.00000000 13.5000000 27.0000000
data
data = structure(list(Sex = structure(c(2L, 1L, 2L, 2L, 1L), .Label = c("F",
"M"), class = "factor"), A = c(1L, 2L, 39L, 24L, 11L), B = c(20L,
10L, 32L, 43L, 3L), C = c(45L, 32L, 2L, 2L, 4L), D = c(42L, 23L,
23L, 44L, 4L), E = c(12L, 43L, 43L, 12L, 11L)), .Names = c("Sex",
"A", "B", "C", "D", "E"), class = "data.frame", row.names = c(NA,
-5L))
It should be a combination of apply, t.test and aggregate, I think. But first turn the row names into a names column. Then you can do the subsetting with aggregate and then apply t.test.

Resources