R: how to sum columns grouped by a factor? - r

If I have a table like this:
user,v1,v2,v3
a,1,0,0
a,1,0,1
b,1,0,0
b,2,0,3
c,1,1,1
How do I turn it into this?
user,v1,v2,v3
a,2,0,1
b,3,0,3
c,1,1,1

In base R,
# Enter the five observations one row at a time, then label rows and columns.
D <- rbind(
  c(1, 0, 0),
  c(1, 0, 1),
  c(1, 0, 0),
  c(2, 0, 3),
  c(1, 1, 1)
)
dimnames(D) <- list(1:5, c("v1", "v2", "v3"))

# Prepend the grouping key so every row is tagged with its user.
D <- data.frame(user = c("a", "a", "b", "b", "c"), D)

# Sum each remaining column within every level of `user`.
aggregate(. ~ user, data = D, FUN = sum)
Returns
> aggregate(. ~ user, D, sum)
user v1 v2 v3
1 a 2 0 1
2 b 3 0 3
3 c 1 1 1

You can use dplyr for this:
library(dplyr)

# Example data: one row per observation, keyed by `user`.
df <- data.frame(
  user = c("a", "a", "b", "b", "c"),
  v1 = c(1, 1, 1, 2, 1),
  v2 = c(0, 0, 0, 0, 1),
  v3 = c(0, 1, 0, 3, 1)
)

# Sum every non-grouping column per user. across() never selects the
# grouping column, so value columns need not be listed by hand; `.names`
# keeps the `<column>_sum` output names.
group_by(df, user) %>%
  summarize(across(everything(), sum, .names = "{.col}_sum"))
If you're not familiar with the %>% notation, it is basically like piping from bash. It takes the output from group_by() and puts it into summarize(). The same thing would be accomplished this way:
# Step 1: mark the rows of `df` as grouped by user.
by_user <- group_by(df, user)

# Step 2: collapse each group to a single row holding the sum of every
# non-grouping column, named `<column>_sum`.
df_summarized <- summarize(
  by_user,
  across(everything(), sum, .names = "{.col}_sum")
)

Related

find start and end idx of a time series by group in a data table

I have a data table that looks like this:
# Eight time steps; `anom` is 1 for time 2-4 and 0 elsewhere.
data <- data.table(
  time = c(0, 1, 2, 3, 4, 5, 6, 7),
  anom = rep(c(0, 1, 0), times = c(2, 3, 3)),
  gier = c(0, 0, 4, 9, 7, 0, 0, 0)
)
Now I am calculating some statistical values of the column gier grouped by column anom like this:
# Names of the columns to summarise.
cols <- "gier"

# Return the mean, median and standard deviation of `x` as a named list.
statFun <- function(x) {
  list(mean = mean(x), median = median(x), std = sd(x))
}
# Apply statFun() to each column in `cols` within every `anom` group and
# flatten one level so each statistic becomes its own column.
statSum <- data[
  ,
  unlist(lapply(.SD, statFun), recursive = FALSE),
  by = anom,
  .SDcols = cols
]
This is fine, but I want to go a step further and add the start and end points of time, depending on where each of the anom groups (0 and 1) starts and ends. In the end I would have something like a new time series, but containing only the start and end points of time. The result should look like this:
# Desired result: one row per run of `anom`, giving the first and last
# `time` of the run and the (rounded) statistics of `gier`.
res <- data.table(
  x.start     = c(0, 2, 5),
  x.end       = c(1, 4, 7),
  anom        = c(0, 1, 0),
  gier.mean   = c(0, 6.666, 0),
  gier.median = c(0, 7, 0),
  gier.std    = c(0, 2.516, 0)
)
How is it possible to achieve this?
Addition: is there a way to achieve the result for multiple columns and not only for one column like gier? For example, I am able to do the following, but I don't know how to extend it with the columns mentioned above. This way there is at least an extra column rn holding the names of the columns for which I calculate the statistical values.
# Per `anom` group: run statFun() on every column in `cols`, stack the
# per-column results as rows, and convert to a data.table that keeps the
# row names (the source column names) in an extra `rn` column.
res <- data[, setDT(do.call(rbind.data.frame, lapply(.SD, statFun)), keep.rownames = TRUE), .SDcols = cols, by = anom]
You can include additional calculations outside lapply:
library(data.table)

# Group by runs of consecutive identical `anom` values. For each run, return
# the statFun() results for the `cols` columns together with the run's `anom`
# value and its first and last `time`.
data[
  ,
  unlist(
    c(
      lapply(.SD, statFun),
      anom = first(anom),
      x.start = first(time),
      x.end = last(time)
    ),
    recursive = FALSE
  ),
  by = rleid(anom),
  .SDcols = cols
]
# rleid gier.mean gier.median gier.std anom x.start x.end
#1: 1 0.000000 0 0.000000 0 0 1
#2: 2 6.666667 7 2.516611 1 2 4
#3: 3 0.000000 0 0.000000 0 5 7
In dplyr we can do this similarly :
library(dplyr)

# Group by runs of consecutive identical `anom` values, then summarise the
# columns named in `cols` and record where each run starts and ends.
data %>%
  # rleid() comes from data.table, not dplyr, so call it by namespace.
  group_by(grp = data.table::rleid(anom)) %>%
  summarise(
    # all_of() is needed when selecting with an external character vector;
    # `.names` yields dotted names such as gier.mean, matching the data.table
    # result.
    across(
      all_of(cols),
      list(mean = mean, median = median, std = sd),
      .names = "{.col}.{.fn}"
    ),
    # Keep the value that defines the run, as in the requested output.
    anom = first(anom),
    x.start = first(time),
    x.end = last(time)
  )

Building sequence data for a recommender system- replacing cross-tabular matrix with a variable value

I am trying to build sequence data for a recommender system. I have built cross-tabular data (Table 1) and Table 2 as shown below:
enter image description here
I have been trying to replace all the 1's in Table 1 by the "Grade" from the Table 2 in R.
Any insight/suggestion is greatly appreciated.
Instead of replacing the values in the first table with those from the second, the second table can be converted directly to 'wide' format with dcast:
library(reshape2)

# Cast the long grade table to one column per course, then arrange the
# columns in the same order as `df1`.
res <- dcast(df2, St.No. ~ Courses, value.var = "Grade")
res <- res[names(df1)]
res
# St.No. Math Phys Chem CS
#1 1 A B
#2 2 B B
#3 3 A A C
#4 4 B B D
If we need to replace the blanks with 0
# Blank grades are stored as empty strings; overwrite them with "0".
res[res == ""] <- "0"
data
# Table 1: 0/1 cross-tab with one row per student and one column per course.
df1 <- data.frame(
  St.No. = 1:4,
  Math = c(0, 0, 1, 1),
  Phys = c(1, 1, 0, 1),
  Chem = c(0, 1, 1, 0),
  CS = c(1, 0, 1, 1)
)

# Table 2: long format with one row per student/course pair; "" means the
# pair has no grade.
df2 <- data.frame(
  St.No. = rep(1:4, each = 4),
  Courses = rep(c("Math", "Phys", "Chem", "CS"), times = 4),
  Grade = c(
    "", "A", "", "B",
    "", "B", "B", "",
    "A", "", "A", "C",
    "B", "B", "", "D"
  ),
  stringsAsFactors = FALSE
)

Coding help in R - Subset and colSum is the topic [duplicate]

If I have a table like this:
user,v1,v2,v3
a,1,0,0
a,1,0,1
b,1,0,0
b,2,0,3
c,1,1,1
How do I turn it into this?
user,v1,v2,v3
a,2,0,1
b,3,0,3
c,1,1,1
In base R,
# Enter the five observations one row at a time, then label rows and columns.
D <- rbind(
  c(1, 0, 0),
  c(1, 0, 1),
  c(1, 0, 0),
  c(2, 0, 3),
  c(1, 1, 1)
)
dimnames(D) <- list(1:5, c("v1", "v2", "v3"))

# Prepend the grouping key so every row is tagged with its user.
D <- data.frame(user = c("a", "a", "b", "b", "c"), D)

# Sum each remaining column within every level of `user`.
aggregate(. ~ user, data = D, FUN = sum)
Returns
> aggregate(. ~ user, D, sum)
user v1 v2 v3
1 a 2 0 1
2 b 3 0 3
3 c 1 1 1
You can use dplyr for this:
library(dplyr)

# Example data: one row per observation, keyed by `user`.
df <- data.frame(
  user = c("a", "a", "b", "b", "c"),
  v1 = c(1, 1, 1, 2, 1),
  v2 = c(0, 0, 0, 0, 1),
  v3 = c(0, 1, 0, 3, 1)
)

# Sum every non-grouping column per user. across() never selects the
# grouping column, so value columns need not be listed by hand; `.names`
# keeps the `<column>_sum` output names.
group_by(df, user) %>%
  summarize(across(everything(), sum, .names = "{.col}_sum"))
If you're not familiar with the %>% notation, it is basically like piping from bash. It takes the output from group_by() and puts it into summarize(). The same thing would be accomplished this way:
# Step 1: mark the rows of `df` as grouped by user.
by_user <- group_by(df, user)

# Step 2: collapse each group to a single row holding the sum of every
# non-grouping column, named `<column>_sum`.
df_summarized <- summarize(
  by_user,
  across(everything(), sum, .names = "{.col}_sum")
)

colSums and group by [duplicate]

If I have a table like this:
user,v1,v2,v3
a,1,0,0
a,1,0,1
b,1,0,0
b,2,0,3
c,1,1,1
How do I turn it into this?
user,v1,v2,v3
a,2,0,1
b,3,0,3
c,1,1,1
In base R,
# Enter the five observations one row at a time, then label rows and columns.
D <- rbind(
  c(1, 0, 0),
  c(1, 0, 1),
  c(1, 0, 0),
  c(2, 0, 3),
  c(1, 1, 1)
)
dimnames(D) <- list(1:5, c("v1", "v2", "v3"))

# Prepend the grouping key so every row is tagged with its user.
D <- data.frame(user = c("a", "a", "b", "b", "c"), D)

# Sum each remaining column within every level of `user`.
aggregate(. ~ user, data = D, FUN = sum)
Returns
> aggregate(. ~ user, D, sum)
user v1 v2 v3
1 a 2 0 1
2 b 3 0 3
3 c 1 1 1
You can use dplyr for this:
library(dplyr)

# Example data: one row per observation, keyed by `user`.
df <- data.frame(
  user = c("a", "a", "b", "b", "c"),
  v1 = c(1, 1, 1, 2, 1),
  v2 = c(0, 0, 0, 0, 1),
  v3 = c(0, 1, 0, 3, 1)
)

# Sum every non-grouping column per user. across() never selects the
# grouping column, so value columns need not be listed by hand; `.names`
# keeps the `<column>_sum` output names.
group_by(df, user) %>%
  summarize(across(everything(), sum, .names = "{.col}_sum"))
If you're not familiar with the %>% notation, it is basically like piping from bash. It takes the output from group_by() and puts it into summarize(). The same thing would be accomplished this way:
# Step 1: mark the rows of `df` as grouped by user.
by_user <- group_by(df, user)

# Step 2: collapse each group to a single row holding the sum of every
# non-grouping column, named `<column>_sum`.
df_summarized <- summarize(
  by_user,
  across(everything(), sum, .names = "{.col}_sum")
)

How to reorder rows in a matrix

I have a matrix and would like to reorder the rows so that, for example, row 5 is moved to row 2 and row 2 to row 7. I have a list of all the row names delimited with \n, and I thought I could somehow read it into R (it's a txt file) and then use the name of the matrix (in my case 'k') to do something like k[txt file,] -> k_new. This does not work, because the identifiers are not in the first column but are defined as rownames.
# Reorder rows by position: the result's row 2 is k's row 5, and its row 7 is k's row 2.
k[ c(1,5,3,4,7,6,2), ] #But probably not what you meant....
Or perhaps (if your 'k' object rownames are something other than the default character-numeric sequence):
# Reorder rows by name: index with a character vector of row names.
k[ char_vec , ] # where char_vec will get matched to the row names.
# Example data frame whose row names are letters; the outer parentheses
# print it as it is assigned.
(dat <- data.frame(
  person = rep(c(1, 2), each = 4),
  time = rep(c(1, 2, 3, 4), times = 2),
  income = c(100, 120, 150, 200, 90, 100, 120, 150),
  disruption = c(0, 0, 0, 1, 0, 1, 1, 0),
  row.names = c("h", "g", "f", "e", "d", "c", "b", "a")
))

# Select rows by row name; they come back in the order requested.
dat[c("h", "f", "d", "b"), ]
#-------------
person time income disruption
h 1 1 100 0
f 1 3 150 0
d 2 1 90 0
b 2 3 120 1

Resources