I have the following table:
# Example input as a matrix. Mixing numbers and strings in cbind() coerces
# every cell to character, so all three columns hold strings.
df <- cbind(
  c(rep(35780, 3), rep(35800, 4), rep(35830, 4)),
  c(
    "Semi_urban", "Rural", "Urban",
    "Multiurban", "Semi_urban", "Rural", "Urban",
    "Multiurban", "Semi_urban", "Rural", "Urban"
  ),
  c(0, 0, 2, 0, 0, 0, 11, 1, 1, 0, 0)
)
df
colnames(df) <- c("Zip_Code", "Zone_type", "Freq")
I want to extract the rows with the maximum Freq for each zip code; if the maximum is tied, I want to extract all of the tied rows:
# Expected result: for every zip code the row(s) holding the maximum Freq;
# tied maxima (zip 35830) are all kept. Zone labels are spelled exactly as in
# the input data ("Semi_urban", with an underscore) so the two can be compared.
df.final <- cbind(
  c(35780, 35800, 35830, 35830),
  c("Urban", "Urban", "Multiurban", "Semi_urban"),
  c(2, 11, 1, 1)
)
df.final
I assume you need matrix output. Does this work:
library(dplyr)
library(tibble)
# Convert the character matrix to a tibble, let type.convert() turn the
# numeric-looking columns back into numbers, then keep the row(s) with the
# largest Freq within every zip code (slice_max() keeps ties by default).
# as_tibble() replaces the deprecated as.tibble(); TRUE replaces the
# reassignable shorthand T.
df %>%
  as_tibble() %>%
  type.convert(as.is = TRUE) %>%
  group_by(Zip_Code) %>%
  slice_max(Freq) %>%
  as.matrix()
Zip_Code Zone_type Freq
[1,] "35780" "Urban" " 2"
[2,] "35800" "Urban" "11"
[3,] "35830" "Multiurban" " 1"
[4,] "35830" "Semi_urban" " 1"
The data.table equivalent:
library(data.table)
# When df is the character matrix, Freq arrives as text. Convert it to a
# number first so that max() compares numerically and not alphabetically
# (as strings, "9" would beat "11"). as.data.table() makes a copy, so the
# := update does not touch df itself.
as.data.table(df)[
  , Freq := as.numeric(as.character(Freq))
][
  , .SD[Freq == max(Freq)], by = Zip_Code
]
Zip_Code Zone_type Freq
1: 35780 Urban 2
2: 35800 Urban 11
3: 35830 Multiurban 1
4: 35830 Semi_urban 1
In Karthik's answer you can also do filter(Freq == max(Freq)) instead of slice_max(Freq), which is more intuitive I think.
Apart from the solutions of the other users, there is also one that only uses base R.
# Base R only. Build the data directly as a data frame with typed columns;
# going through cbind() would first coerce every value to character.
df <- data.frame(
  Zip_Code = as.character(
    c(35780, 35780, 35780, 35800, 35800, 35800, 35800,
      35830, 35830, 35830, 35830)
  ),
  Zone_type = c(
    "Semi_urban", "Rural", "Urban",
    "Multiurban", "Semi_urban", "Rural", "Urban",
    "Multiurban", "Semi_urban", "Rural", "Urban"
  ),
  Freq = as.integer(c(0, 0, 2, 0, 0, 0, 11, 1, 1, 0, 0)),
  stringsAsFactors = FALSE
)

# For each zip code keep every row whose Freq equals that zip code's maximum,
# so tied maxima are all retained.
ls.result <- lapply(
  split(df, f = df$Zip_Code),
  function(df.for.zip.code) {
    subset(df.for.zip.code, subset = Freq == max(Freq))
  }
)

# Stack the per-zip pieces. rbind() is the direct way to combine row subsets;
# a chain of full outer merge() calls yields the same rows at a higher cost.
df.result <- do.call(rbind, ls.result)
rownames(df.result) <- NULL
df.result
As suggested by the title, I would like to extract values from other rows.
In particular, as an example please consider the following dataset:
# Example design: 4 groups of 3 members each, observed over 3 trials
# (36 rows); subjects are assigned to different groups from trial to trial.
id.in.group <- rep(c(1, 2, 3), times = 12)
group <- rep(rep(c(1, 2, 3, 4), each = 3), times = 3)
trial <- rep(c(1, 2, 3), each = 12)
subject <- c(
  "s7", "s11", "s3", "s6", "s9", "s4", "s12", "s10", "s1", "s8", "s2", "s5",
  "s5", "s9", "s6", "s10", "s1", "s3", "s4", "s7", "s2", "s8", "s12", "s11",
  "s5", "s3", "s9", "s12", "s11", "s10", "s1", "s6", "s7", "s4", "s2", "s8"
)
df <- data.frame(group, id.in.group, trial, subject)

# Ids of the two other members of each row's group:
#   id 1 -> others 2 and 3;  id 2 -> others 1 and 3;  id 3 -> others 1 and 2.
# Indexing a lookup vector by id.in.group does this in one step, without
# comparing the numeric id against the strings "1", "2", "3".
df$other1.id <- c(2, 1, 1)[df$id.in.group]
df$other2.id <- c(3, 3, 2)[df$id.in.group]

# View() opens the interactive data viewer; skip it when run as a script.
if (interactive()) View(df)
Given the group number (df$group) and the id of the others in the group (df$other1.id and df$other2.id), I would like to create two further variables showing, for each trial and each subject, the value of the other 2 subjects rather than their relative id.in.group, so as to get the two following columns
# Desired output, typed in by hand: for every row, the subject label of the
# group member (same group, same trial) whose id.in.group equals other1.id.
df$other1.subject<-c("s11","s7","s7","s9","s6","s6","s10","s12","s12","s2","s8","s8","s9","s5","s5","s1","s10","s10","s7","s4","s4","s12","s8","s8", "s3","s5","s5","s11","s12","s12","s6","s1","s1","s2","s4","s4")
# Likewise for the group member whose id.in.group equals other2.id.
df$other2.subject<-c("s3","s3","s11","s4","s4","s9","s1","s1","s10","s5","s5","s2","s6","s6","s9","s3","s3","s1","s2","s2","s7","s11","s11","s12","s9","s9","s3","s10","s10","s11","s7","s7","s6","s8","s8","s2")
# Inspect the result in the interactive data viewer.
View(df)
For instance, if trial = 1 and id.in.group = 1 (or alternatively, subject = s7), then other1.subject = s11 while other2.subject = s3. I would like to extract such values for each id.in.group (or each subject) or for each row.
I apologize for not providing any previous attempt but, honestly, I have no clue how to tackle the problem. I remain open to any further clarification.
Many thanks for all your help!
You need to left join df with itself two times - one for other1, second for other2:
library(dplyr)
# Self-join df twice on (group, trial, partner id). Each join uses a trimmed
# copy of df as a lookup table that maps a member id to its subject label,
# once for other1 and once for other2.
df %>%
  left_join(
    df %>%
      select(group, trial, id.in.group, subject) %>%
      rename(other1.id = id.in.group, other1.subject = subject),
    by = c("group", "trial", "other1.id")
  ) %>%
  left_join(
    df %>%
      select(group, trial, id.in.group, subject) %>%
      rename(other2.id = id.in.group, other2.subject = subject),
    by = c("group", "trial", "other2.id")
  )
I want to turn a table into a data frame. Three columns should be there: 1. the zip code 2 outcome "0" and 3 outcome "1". But as.data.frame.matrix turns the zip-code into row names and makes them unusable.
I tried to add a fourth column with imaginary ID's (1:100) so R makes them to row names but R tells me, that "all arguments must be the same length" - which they are!
# Simulated data: 5000 records, each with a zip code drawn (with replacement)
# from 100:200 and a binary outcome drawn with success probability 0.23.
id <- 1:5000
zip <- sample(100:200, 5000, replace = TRUE)
outcome <- rbinom(5000, 1, 0.23)
df <- data.frame(id, outcome, zip)
# Cross-tabulate zip (rows) against outcome (columns).
# Note: `abs` is also the name of a base R function; reusing it as a variable
# name is legal but easy to misread.
abs <- table(df$zip, df$outcome)
# Convert the table to a wide data frame; as described above, the zip codes
# end up as row names rather than as a column.
abs <- as.data.frame.matrix(abs)
Does someone have a nice and slick idea? Thanks in advance!
Edit:
When:
# Attempt from the edit: convert with as.data.frame() and then as.matrix().
abs <- as.matrix(as.data.frame(abs))
I get something close to what I want but the outcomes are together in one column. How to untie them, to make them look like the table again?
You can get to your desired result easier with dplyr and tidyr:
library(dplyr)
library(tidyr)
# Simulated data: 5000 records with a zip code in 100:200 and a 0/1 outcome.
id <- 1:5000
zip <- sample(100:200, 5000, replace = TRUE)
outcome <- rbinom(5000, 1, 0.23)
df <- data.frame(id, outcome, zip)

# One row per zip code, with one count column per outcome ("0" and "1").
# count() does the group_by() + summarise(n()) + ungroup() in one step, and
# pivot_wider() is the current replacement for the superseded spread().
# values_fill writes 0 (as table() would) where a zip/outcome combination
# never occurs; names_sort keeps the outcome columns in sorted order.
df <- df %>%
  count(zip, outcome, name = "freq") %>%
  pivot_wider(
    names_from = outcome,
    values_from = freq,
    values_fill = 0L,
    names_sort = TRUE
  )
You are supplying only 100 values to a data.frame that has 101 rows.
> nrow(abs)
[1] 101
so this would work
# Size the id column from the data instead of hard-coding the row count, so
# the assignment still works if the number of distinct zip codes changes.
abs$new_col <- seq_len(nrow(abs))
I think you want this:
# Convert abs to a data frame, then reorder its columns by position
# (2nd, 3rd, 1st).
# NOTE(review): presumably abs is still the table() result here, so that
# as.data.frame() yields the long Var1/Var2/Freq layout — confirm.
abs2 <- as.data.frame(abs) %>% select(2,3,1)
Can I find out if there is any code that creates a table or data frame that binds multiple tables for me?
# Number of rows for each distinct value of col1.
table(df$col1)
# Cross-tabulations of col1 against a logical condition: per col1 value, the
# count of rows where the condition is FALSE and where it is TRUE.
table(df$col1,df$col2<0)
table(df$col1,df$col3>0)
table(df$col1,df$col4>0)
In the above example, I am grouping my dataset based on similar values in df$col1 and displaying data that satisfy the condition df$col2<0. What I get is a true and false matrix of the number of records fulfilling the condition and not. I want a combined table that still groups the data by df$col1 and shows the true condition for df$col2<0,df$col3>0 and df$col4>0 in the same table.
Based on the description, we could do a cbind
# Column 1: number of rows per col1 value. Columns 2-4: how many of those
# rows satisfy each condition. Summing the logical per group, instead of
# taking column 2 of a two-way table, still works when a condition is never
# TRUE or always TRUE (the two-way table then has a single column).
# na.rm = TRUE mirrors table(), which ignores missing values by default.
r1 <- cbind(
  table(df$col1),
  tapply(df$col2 < 0, df$col1, sum, na.rm = TRUE),
  tapply(df$col3 > 0, df$col1, sum, na.rm = TRUE),
  tapply(df$col4 > 0, df$col1, sum, na.rm = TRUE)
)
If there are many columns, this can be done by looping
# Same table built in a loop: pair every column after the first with its
# comparison operator and count, per col1 value, the rows where the
# comparison against 0 holds. match.fun() resolves the operator name to a
# function; summing the logical per group avoids relying on the position of
# a "TRUE" column that may not exist.
r2 <- do.call(
  cbind,
  c(
    list(col1 = table(df$col1)),
    Map(
      function(x, y) tapply(match.fun(y)(x, 0), df$col1, sum, na.rm = TRUE),
      df[-1],
      c("<", ">", ">")
    )
  )
)
# Both constructions should hold the same counts.
all.equal(r1, r2, check.attributes = FALSE)
#[1] TRUE
We can also do this with group by operations.
library(dplyr)
# Turn the three value columns into condition flags, then report per col1
# value the group size n and the number of rows meeting each condition.
# across() replaces mutate_at()/funs() and summarise_all(); .add = TRUE is
# the current spelling of group_by()'s former add argument.
df %>%
  mutate(
    col2 = col2 < 0,
    across(c(col3, col4), ~ .x > 0)
  ) %>%
  group_by(col1) %>%
  mutate(n = n()) %>%
  # n is constant within col1, so adding it as a group key just carries it
  # through the summary.
  group_by(n, .add = TRUE) %>%
  summarise(across(everything(), sum))
data
# Reproducible example data: a 10 x 4 data frame of integers sampled with
# replacement from -2:5, with columns named col1 ... col4.
set.seed(24)
df <- setNames(
  as.data.frame(matrix(sample(-2:5, size = 40, replace = TRUE), ncol = 4)),
  paste0("col", 1:4)
)
I have a data set with a lot of values. The majority of x matches a value in y uniquely. However some of x match multiple ys. Is there an easy way to find which values of y map to multiple xs?
# Two copies of the alphabet: lower case in x, the matching capital in y.
# Four y values are then overwritten so that "A" and "Z" are each paired
# with several different x values.
mydata <- data.frame(x = rep(letters, times = 2), y = rep(LETTERS, times = 2))
mydata$y[c(3, 5, 10, 15)] <- c("A", "A", "Z", "Z")
mydata %>% foo
[1] "A" "Z"
I apologize if I am missing some obvious command here.
Using dplyr, you can do:
library(dplyr)
mydata <- data.frame(x = letters, y = LETTERS, stringsAsFactors = FALSE)
mydata$y[c(3, 5)] <- "A"
mydata$y[c(10, 15)] <- "Z"
# Keep the rows whose y is paired with more than one distinct x. Counting
# distinct x values (rather than rows) stays correct when the same x/y pair
# occurs on several rows, as in the question's doubled data.
mydata %>%
  group_by(y) %>%
  filter(n_distinct(x) > 1)
If you want to extract just the y values, you can store that to a data frame like this and find unique y values:
# Rows whose y value is paired with more than one distinct x (counting
# distinct x values, so repeated identical x/y rows are not mistaken for a
# one-to-many mapping) ...
df <- mydata %>%
  group_by(y) %>%
  filter(n_distinct(x) > 1)
# ... and the y values themselves, once each.
unique(df$y)
Another alternative format to get the same output into is as follows. This returns a single column data frame instead of a vector as above.
# One-column result holding each y that is paired with more than one distinct
# x; n_distinct(x) ignores repeated identical x/y rows.
mydata %>%
  group_by(y) %>%
  filter(n_distinct(x) > 1) %>%
  select(y) %>%
  distinct()
use data.table
library(data.table)
# Convert mydata to a data.table by reference.
setDT(mydata)
# Number of distinct x values per y; keep every y that has more than one.
# (A cut-off of n > 2 would skip y values shared by exactly two xs.)
mydata[, list(n = uniqueN(x)), by = y][n > 1, ]
# y n
# 1: A 3
# 2: Z 3
If we need the corresponding unique values in 'x'
library(data.table)
# For every y paired with more than one distinct x, list those x values as a
# comma-separated string. Testing uniqueN(x) instead of the row count .N
# keeps repeated identical x/y rows from being treated as multiple xs, and
# also catches y values shared by exactly two xs.
setDT(mydata)[, if (uniqueN(x) > 1) toString(unique(x)), by = y]
# y V1
#1: A a, c, e
#2: Z j, o, z