French student here, so my english's not that great, sorry.
We transformed a data set with species and their locations, to the corresponding origin of theses species and their locations.
The data set has 600~ columns, named U, A, W, L or E (species origin), inside of which a 0 or 1 (presence / absence of a species at location)
and 2 columns with coordonates (corresponding to the data collecting station).
More than 8000 lines, for each station where data was found.
A simplification of the data set would like that :
[Longitude] [Latitude] [A][U][U][L][E][A][U] ... [+600]
[1,] -5.89 35.71 0 0 1 0 0 1 1
[2,] -5.89 35.81 0 1 0 0 0 0 1
[3,] -5.89 36.01 1 0 0 1 1 1 0
[4,] -5.89 36.1 0 0 0 1 0 1 0
[1,] -5.89 36.21 1 1 1 0 0 1 1
[2,] -5.79 35.81 1 1 0 1 0 1 0
[3,] -5.79 35.91 0 1 0 0 0 0 1
[4,] -5.79 36.01 1 1 0 1 0 1 0
[+8000]
What we want to do is to some sort of conditional sum, where all origin are regrouped into one column each and their content summed , like so :
`
[Longitude] [Latitude] [A][U][L][W][E]
[1,] -5.89 35.71 12 6 5 0 13
[2,] -5.89 35.81 5 1 8 10 20
[3,] -5.89 36.01 1 28 3 6 2
[4,] -5.89 36.1 4 25 0 1 11
[1,] -5.89 36.21 9 1 9 3 5
[2,] -5.79 35.81 6 5 12 1 8
[3,] -5.79 35.91 5 2 7 15 10
[4,] -5.79 36.01 10 3 5 12 4
[+8000]
Only the A,U,L,E,W must be summed.
Longitude, Latitude and number of rows must ne kept the same.
We tried aggregate or tapply, without success, but maybe a loop is needed...
Any ideas ?
a capture of the data set
MacOs answer
MacOS answer 2
Thanks
MacOS function : espOri => df espagg => df.agg
espagg <- aggregate(. ~ Longitude + Latitude,
especeOri,
FUN = sum)
aggregate.columns <- function(especeOri, column.names)
{
for (column.name in column.names) {
especeOri[[column.name]] <- rowSums(subset(espagg, select = grep(paste(column.name, ".*", sep = ""), colnames(especeOri))))
}
return(especeOri)
}
aggregate.column.names <- c("A", "U", "L", "E", "W")
espagg <- aggregate.columns(espagg, aggregate.column.names)
espagg <- subset(especeOri, select = c("Longitude", "Latitude", aggregate.column.names))
View(espagg)
dput of the data set
dput(especeOri[1:10,1:20])
structure(list(Longitude = c(-5.89, -5.89, -5.89, -5.89, -5.89,
-5.79, -5.79, -5.79, -5.79, -5.69), Latitude = c(35.71, 35.81,
36.01, 36.11, 36.21, 35.81, 35.91, 36.01, 36.11, 35.81), L = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), U.1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L), A = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.2 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), E = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), U.3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), E.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.4 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.5 = c(0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L), U.6 = c(1L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 1L), L.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
U.7 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), U.8 = c(0L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L), U.9 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), U.10 = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), A.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), U.11 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L)), row.names = c(NA, 10L), class = "data.frame")
google drive with all the data sets, a few explanations and our script.
https://drive.google.com/drive/folders/1fnWnZZDC3gyWTtSoqi_l7Wuhje5qpJmL?usp=sharing
EDIT : added some values for longitude and latitude to illustrate and a screenshot
Here is a tidyverse solution using the data you provided.
library(dplyr)
library(tidyr)
fish <- read.table("Data_fish.txt", header = T)
traits <- read.table("Data_traits.txt", header = T)
fish %>%
pivot_longer(-c(ID_cellule, Longitude, Latitude), names_to = "Species", values_to = "Occur") %>%
mutate(ID_cellule = factor(ID_cellule, levels = unique(ID_cellule))) %>% # use factor to fix the display order as-is
left_join(traits %>% select(Species, Origin), by = "Species") %>%
group_by(ID_cellule, Longitude, Latitude, Origin) %>%
summarise(Occur = sum(Occur)) %>%
pivot_wider(names_from = "Origin", values_from = "Occur")
Output
# A tibble: 8,154 x 8
# Groups: ID_cellule, Longitude, Latitude [8,154]
ID_cellule Longitude Latitude A E L U W
<fct> <dbl> <dbl> <int> <int> <int> <int> <int>
1 ID1 -5.89 35.7 8 10 0 178 0
2 ID2 -5.89 35.8 11 10 0 234 0
3 ID3 -5.89 36.0 9 11 0 195 0
4 ID4 -5.89 36.1 12 10 0 227 0
5 ID5 -5.89 36.2 13 17 0 268 0
6 ID6 -5.79 35.8 9 8 0 205 0
7 ID7 -5.79 35.9 8 9 0 168 0
8 ID8 -5.79 36.0 11 14 0 262 0
9 ID9 -5.79 36.1 10 10 0 193 0
10 ID10 -5.69 35.8 9 10 0 230 0
The following should do the job.
df <- data.frame(Longitude = c(-5.89, -5.89, -5.89, -5.89, -5.89, -5.79, -5.79, -5.79, -5.89, -5.89),
Latitude = c(35.71, 35.81, 36.01, 36.1, 36.21, 35.81, 35.91, 36.01, 35.71, 35.81),
A = c(0, 0, 1, 0, 1, 1, 0, 1, 1, 1),
U = c(0, 1, 0, 0, 1, 1, 1, 1, 1, 1),
U = c(1, 0, 0, 0, 1, 0, 0, 0, 1, 1),
L = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1),
E = c(0, 0, 1, 0, 0, 0, 0, 0, 1, 1),
A = c(1, 0, 1, 1, 1, 1, 0, 1, 1, 1),
U = c(1, 1, 0, 0, 1, 0, 1, 0, 1, 1))
df.agg <- aggregate(. ~ Longitude + Latitude,
df,
FUN = sum)
df.agg$A <- rowSums(subset(df.agg, select = grep("A.*", colnames(df.agg))))
df.agg$U <- rowSums(subset(df.agg, select = grep("U.*", colnames(df.agg))))
df.agg$L <- rowSums(subset(df.agg, select = grep("L.*", colnames(df.agg))))
df.agg$E <- rowSums(subset(df.agg, select = grep("E.*", colnames(df.agg))))
df.agg <- subset(df.agg, select = c(Longitude, Latitude, A, U, L, E))
Update
The OP user asked for a solution where he/she does not have to write the code for rowSums explicitely, because he/she has to many columns to actually write it out, i.e. it is inconvinient. The following should do the job.
df <- structure(list(Longitude = c(-5.89, -5.89, -5.89, -5.89, -5.89,
-5.79, -5.79, -5.79, -5.79, -5.69), Latitude = c(35.71, 35.81,
36.01, 36.11, 36.21, 35.81, 35.91, 36.01, 36.11, 35.81), L = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), U.1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L), A = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.2 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), E = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), U.3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), E.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.4 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), U.5 = c(0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L), U.6 = c(1L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 1L), L.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
U.7 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), U.8 = c(0L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L), U.9 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), U.10 = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), A.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), U.11 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L)), row.names = c(NA, 10L), class = "data.frame")
df.agg <- aggregate(. ~ Longitude + Latitude,
df,
FUN = sum)
# This function aggregates rows if their column names have the same start.
# Suppose we have a data frame with column names A, A.1, and A.2. Then,
# the rows of these columns are aggregated using sum. So,
# A 1 1 0
# A.1 2 1 0
# A.2 0 0 1
# becomes
# A 3 2 1
aggregate.columns <- function(df.my, column.names)
{
for (column.name in column.names) {
df.my[[column.name]] <- df.my[[column.name]] +
rowSums(subset(df.my,
select = grep(paste(column.name, ".[1-9]+", sep = ""),
colnames(df.my))))
}
return(df.my)
}
aggregate.column.names <- c("A", "U", "L", "E")
df.agg <- aggregate.columns(df.agg, aggregate.column.names)
df.agg <- subset(df.agg, select = c("Longitude", "Latitude", aggregate.column.names))
df.agg
The key to making this work is this line.
grep(paste(column.name, ".[1-9]+", sep = ""), colnames(df.my))
It returns all column names that have start with the current value of variable column.name followed by a dot and any sequence of digits, e.g. when the value of column.name is A then A.1, A.345, A.67, A.9798, A.111111 should all be returned. Please check!
Update 3
After the user of the OP provided the data, I did come up with the following. This includes a function for renaming. This is necessary since the data frame has columns with identical names. For example, this function transforms a sequence of column names A, A, A, A into A, A.1, A.2, A.3.
climate <- read.table("Data_climate.txt", header = T)
poissons <- read.table("Data_fish.txt", header = T)
traitsNA <- read.table("Data_traits.txt", header = T)
especes <- poissons [,-2]
especes2 <- especes [,-2]
especes3 <- especes2 [,-1]
colnames(especes3) <- traitsNA$Origin
especes44<-cbind(climate$Latitude,especes3)
especeOri <- cbind(climate$Longitude,especes44)
origine <- cbind(climate$ID_cellule,especeOri)
colnames(origine)[1] <- "ID_cellule"
colnames(origine)[2] <- "Longitude"
colnames(origine)[3] <- "Latitude"
colnames(especeOri)[1] <- "Longitude"
colnames(especeOri)[2] <- "Latitude"
rename.columns <- function(df)
{
unique.column.names <- unique(colnames(df))
for (unique.column.name in unique.column.names)
{
idxs.columns <- which(colnames(df) == unique.column.name)
df.tmp.with.new.col.names <- subset(df, select = idxs.columns)
colnames(df)[idxs.columns] <- colnames(df.tmp.with.new.col.names)
}
return(df)
}
especeOri <- rename.columns(especeOri)
espagg <- aggregate(. ~ Longitude + Latitude,
especeOri,
FUN = sum)
# This function aggregates rows if their column names have the same start.
# Suppose we have a data frame with column names A, A.1, and A.2. Then,
# the rows of these columns are aggregated using sum. So,
# A 1 1 0
# A.1 2 1 0
# A.2 0 0 1
# becomes
# A 3 2 1
aggregate.columns <- function(df.my, column.names)
{
for (column.name in column.names) {
df.my[[column.name]] <- df.my[[column.name]] +
rowSums(subset(df.my,
select = grep(paste(column.name, ".[1-9]+",
sep = ""),
colnames(df.my))))
}
return(df.my)
}
aggregate.column.names <- c("A", "U", "L", "E", "W")
espagg <- aggregate.columns(espagg, aggregate.column.names)
espagg <- subset(especeOri, select = c("Longitude", "Latitude", aggregate.column.names))
HTH!
I would like to calculate the percentage of people reported doing some work per day. For example I would like to know the percentage of people reported doing some work on Monday from the entire sample.
I used the following code to calculate this, but I am not sure about my result.
df1 <- structure(list(id = c(12L, 123L, 10L), t1_1 = c(0L, 0L, 1L),
t1_2 = c(1L, 0L, 1L), t1_3 = c(1L, 0L, 1L), t2_1 = c(0L,
1L, 1L), t2_2 = c(1L, 1L, 1L), t2_3 = c(0L, 1L, 1L), t3_1 = c(1L,
0L, 1L), t3_2 = c(0L, 0L, 1L), t3_3 = c(1L, 0L, 1L), t4_1 = c(0L,
1L, 1L), t4_2 = c(1L, 1L, 1L), t4_3 = c(0L, 1L, 1L), t5_1 = c(0L,
1L, 1L), t5_2 = c(1L, 1L, 1L), t5_3 = c(0L, 1L, 1L), t6_1 = c(1L,
0L, 1L), t6_2 = c(1L, 0L, 1L), t6_3 = c(1L, 0L, 1L), t7_1 = c(0L,
1L, 1L), t7_2 = c(0L, 1L, 1L), t7_3 = c(1L, 1L, 1L)),
class = "data.frame", row.names = c(NA, -3L))
Variable description t1 - Monday (t1_1, t1_2, t1_3 - are time steps that measured if work was done during Monday); t2 - Tuesday; t3 - Wednesday; t4 - Thursday; t5 - Friday; t6 - Saturda and t7- Sunday; id is an identification number
df2 <- reshape2::melt(df1, id.vars = "id")
df2$variable <- as.character(df2$variable)
df2$day <- sapply(strsplit(df2$variable, "_"), `[`, 1)
df2$day <- factor(df2$day, levels = variable)
df3<-df2 %>%
group_by (day) %>%
mutate (percent = (value/sum(value) *100))
ggplot(df3, aes(day, group = value)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
scale_fill_discrete(name="Days", labels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")) +
scale_y_continuous(labels=scales::percent, limits=c(0,1)) +
ylab("relative frequencies") +
theme_bw()
Result:
library(dplyr)
df1 <- structure(
list(id = c(12L, 123L, 10L),
t1_1 = c(0L, 0L, 1L), t1_2 = c(1L, 0L, 1L), t1_3 = c(1L, 0L, 1L),
t2_1 = c(0L, 1L, 1L), t2_2 = c(1L, 1L, 1L), t2_3 = c(0L, 1L, 1L),
t3_1 = c(1L, 0L, 1L), t3_2 = c(0L, 0L, 1L), t3_3 = c(1L, 0L, 1L),
t4_1 = c(0L, 1L, 1L), t4_2 = c(1L, 1L, 1L), t4_3 = c(0L, 1L, 1L),
t5_1 = c(0L, 1L, 1L), t5_2 = c(1L, 1L, 1L), t5_3 = c(0L, 1L, 1L),
t6_1 = c(1L, 0L, 1L), t6_2 = c(1L, 0L, 1L), t6_3 = c(1L, 0L, 1L),
t7_1 = c(0L, 1L, 1L), t7_2 = c(0L, 1L, 1L), t7_3 = c(1L, 1L, 1L)),
class = "data.frame", row.names = c(NA, -3L))
df2 <- reshape2::melt(df1, id.vars = "id")
df2$variable <- as.character(df2$variable)
df2$day <- sapply(strsplit(df2$variable, "_"), `[`, 1)
df3 <- df2 %>%
group_by(id, day) %>%
summarize(count = sum(value)) %>%
group_by(id) %>%
mutate(percent = count / sum(count)) %>%
arrange(day, id)
> df3
# A tibble: 21 x 4
# Groups: id [3]
id day count percent
<int> <chr> <int> <dbl>
1 10 t1 3 0.143
2 12 t1 2 0.182
3 123 t1 0 0
4 10 t2 3 0.143
5 12 t2 1 0.0909
6 123 t2 3 0.25
...
Is it something you are looking for?
I have a datset that looks something like this:
> head(BurnData)
Treatment Gender Race Surface head buttock trunk up.leg low.leg resp.tract type ex.time excision antib.time antibiotic
1 0 0 0 15 0 0 1 1 0 0 2 12 0 12 0
2 0 0 1 20 0 0 1 0 0 0 4 9 0 9 0
3 0 0 1 15 0 0 0 1 1 0 2 13 0 13 0
4 0 0 0 20 1 0 1 0 0 0 2 11 1 29 0
5 0 0 1 70 1 1 1 1 0 0 2 28 1 31 0
6 0 0 1 20 1 0 1 0 0 0 4 11 0 11 0
inf.time infection
1 12 0
2 9 0
3 7 1
4 29 0
5 4 1
6 8 1
I want to run a Cox's Regression on variables Surface, ex.time and, antib.time and treatment. Treatment is an indicator variable. Surface denotes the % of body burned. ex.time and antib.time both record time to event in days.
I am aware that to run a time dependent Cox's Regression i need to convert the data in longitudinal structure, but how can i do it in R?
then i will use the forluma:
coxph(formula = Surv(tstart, tstop, infection) ~ covariate)
DATA
> dput(head(BurnData))
structure(list(Treatment = c(0L, 0L, 0L, 0L, 0L, 0L), Gender = c(0L,
0L, 0L, 0L, 0L, 0L), Race = c(0L, 1L, 1L, 0L, 1L, 1L), Surface = c(15L,
20L, 15L, 20L, 70L, 20L), head = c(0L, 0L, 0L, 1L, 1L, 1L), buttock = c(0L,
0L, 0L, 0L, 1L, 0L), trunk = c(1L, 1L, 0L, 1L, 1L, 1L), up.leg = c(1L,
0L, 1L, 0L, 1L, 0L), low.leg = c(0L, 0L, 1L, 0L, 0L, 0L), resp.tract = c(0L,
0L, 0L, 0L, 0L, 0L), type = c(2L, 4L, 2L, 2L, 2L, 4L), ex.time = c(12L,
9L, 13L, 11L, 28L, 11L), excision = c(0L, 0L, 0L, 1L, 1L, 0L),
antib.time = c(12L, 9L, 13L, 29L, 31L, 11L), antibiotic = c(0L,
0L, 0L, 0L, 0L, 0L), inf.time = c(12L, 9L, 7L, 29L, 4L, 8L
), infection = c(0L, 0L, 1L, 0L, 1L, 1L), Surface_discr = structure(c(1L,
1L, 1L, 1L, 2L, 1L), .Label = c("1", "2"), class = "factor"),
ex.time_discr = c(1L, 1L, 1L, 1L, 2L, 1L), antib.time_discr = c(1L,
1L, 1L, 2L, 2L, 1L)), .Names = c("Treatment", "Gender", "Race",
"Surface", "head", "buttock", "trunk", "up.leg", "low.leg", "resp.tract",
"type", "ex.time", "excision", "antib.time", "antibiotic", "inf.time",
"infection", "Surface_discr", "ex.time_discr", "antib.time_discr"
), row.names = c(NA, 6L), class = "data.frame")
I have two dataframes, df1 (2000 rows) and df2 (100,000 rows). And I want to:
Read lat/long combination from df1
Match with lat/long from df2
Read the corresponding "m" value of df2
Add it to df1
How can I go about it?
df1
lat long val
33.29083 -109.19556 12.5000
32.96583 -109.30972 11.5000
33.04944 -109.29528 1.7500
33.06444 -109.44167 1.7500
33.61944 -110.92083 0.2500
33.98000 -111.30278 3.7500
33.79806 -110.49917 1.00008
38.37972 -119.44917 3.2500
And
df2<-structure(list(lat = c(33.29, 32.96, 48.15, 48.1, 48.18, 48.14
), lon = c(-109.19, -109.31, -124.69, -124.69, -124.68, -124.68
), m = c(0.4713, 0.8998, 0.4891, 0.8418, 0.7998, 0.5292), flagar = c(0L,
1L, 0L, 1L, 1L, 0L), flagk = c(0L, 0L, 0L, 0L, 0L, 0L), flagsi = c(0L,
0L, 0L, 0L, 0L, 0L), flags2o = c(0L, 0L, 0L, 0L, 0L, 0L), flagap = c(0L,
0L, 0L, 0L, 0L, 0L), flagmt = c(0L, 0L, 0L, 0L, 0L, 0L), flagcn = c(0L,
0L, 0L, 0L, 0L, 0L), flagkf = c(1L, 1L, 1L, 1L, 1L, 1L), flagrd = c(1L,
1L, 1L, 1L, 1L, 1L), flagrv = c(2L, 2L, 2L, 2L, 2L, 2L), flagpt = c(1L,
1L, 1L, 1L, 1L, 1L)), .Names = c("lat", "lon", "m", "flagar",
"flagk", "flagsi", "flags2o", "flagap", "flagmt", "flagcn", "flagkf",
"flagrd", "flagrv", "flagpt"), row.names = c(NA, 6L), class = "data.frame")
Using the data.table package you could do it as follows:
library(data.table)
# convert the dataframes to datatables (which are an extended form of a dataframe)
# and round the lat & lon variables to one decimal
setDT(df1)[, `:=` (lat = round(lat,1), lon = round(lon,1))]
setDT(df2)[, `:=` (lat = round(lat,1), lon = round(lon,1))]
# joining the m column of df2 to df1 based on exact matches of the rounded lat/lon values
# m = i.m transfers the matching values to the df1 dataframe
df1[df2, m := i.m, on=c("lat","lon")]
this gives:
> df1
lat lon val m
1: 33.3 -109.2 12.50000 0.4713
2: 33.0 -109.3 11.50000 0.8998
3: 33.0 -109.3 1.75000 0.8998
4: 33.1 -109.4 1.75000 NA
5: 33.6 -110.9 0.25000 NA
6: 34.0 -111.3 3.75000 NA
7: 33.8 -110.5 1.00008 NA
8: 38.4 -119.4 3.25000 NA
If you want to match on shortest distance, see this answer for an example.