I am working with the R programming language.
I have the following dataset ("my_data"):
structure(list(idd = 1:50, group_1 = c("B", "B", "A", "B", "B",
"A", "A", "A", "B", "A", "A", "B", "B", "B", "A", "A", "A", "A",
"B", "B", "A", "B", "A", "B", "A", "B", "B", "A", "B", "B", "B",
"A", "B", "A", "B", "B", "A", "A", "A", "A", "A", "B", "B", "B",
"A", "B", "B", "B", "B", "B"), v1 = c(15.7296737049317, -4.33377704672207,
-0.551850185265, 2.66888122578048, 12.109072642513, 0.0107927293899017,
20.7785032320562, -1.98974382507874, 12.1663703518471, 11.4308702978893,
-0.657500910529805, 5.71376589298221, 3.43820523228653, 19.5939432685761,
25.5605263610222, -0.407964337882465, 19.3057240854025, 9.24554068987809,
-9.6719534905096, 2.44096357354807, 14.6114916050676, 11.4510663104787,
-14.4231132108142, 15.8031868545157, 16.5505199848675, 6.95491162740581,
2.92431767382703, 29.7157201447823, 9.10001319352251, 9.85982748068076,
-1.23456937110154, -3.44130123376206, -5.23155771062088, 5.78031789617826,
23.6092446408098, 27.5379484533487, 25.6836473435279, 22.9675556994775,
7.62403748556388, -2.24150135680706, 6.72187319859928, -14.1245027627225,
6.8620712655661, 26.5987870464572, 11.3095310060752, 20.9588868268958,
14.8934095694391, 2.21089704551347, 27.4355935292935, 9.21612714668934
), group_2 = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L)), row.names = c(NA, -50L), class = "data.frame")
head(my_data)
idd group_1 v1 group_2
1 1 B 15.72967370 1
2 2 B -4.33377705 2
3 3 A -0.55185019 3
4 4 B 2.66888123 4
5 5 B 12.10907264 5
6 6 A 0.01079273 6
7 7 A 20.77850323 7
8 8 A -1.98974383 8
9 9 B 12.16637035 9
10 10 A 11.43087030 10
11 11 A -0.65750091 1
12 12 B 5.71376589 2
For this dataset, I want to perform the following steps in "dplyr":
For each grouping of 10 rows, find the sum of "v1" for group_1 = "A" and the sum of "v1" for group_1 = "B"
For each of these groupings, create a new variable ("v2") that is : "A" if sum(group_1 = A) > sum(group_1 = B), "B" if sum(group_1 = A) < sum(group_1 = B) or "0" if sum(group_1 = A) = sum(group_1 = B)
I know how to do this manually in R:
# Manual baseline: split my_data into consecutive blocks of 10 rows, label each
# block with v2 by comparing the per-group_1 totals of v1, then stack the
# blocks back together.
# STEP 1: since my_data has 50 rows, break my_data into 5 groups of 10 rows
rows_1 = my_data[1:10,]
rows_2 = my_data[11:20,]
rows_3 = my_data[21:30,]
rows_4 = my_data[31:40,]
rows_5 = my_data[41:50,]
# STEP 2: find out values of "v2"
# For every block: total v1 per group_1 level, then compare row 1 of that
# summary with row 2 ("A" if row 1 is larger, "B" if smaller, 0 on a tie).
# NOTE(review): presumably row 1 holds the "A" total and row 2 the "B" total
# (summary ordered by group_1) - confirm; a block that contains only one
# group_1 level would have no second row to compare against.
library(dplyr)
dplyr_row_1 = data.frame(rows_1 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_1$v2 = ifelse(dplyr_row_1[1,2] > dplyr_row_1[2,2], "A", ifelse(dplyr_row_1[1,2] < dplyr_row_1[2,2], "B", 0))
dplyr_row_2 = data.frame(rows_2 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_2$v2 = ifelse(dplyr_row_2[1,2] > dplyr_row_2[2,2], "A", ifelse(dplyr_row_2[1,2] < dplyr_row_2[2,2], "B", 0))
dplyr_row_3 = data.frame(rows_3 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_3$v2 = ifelse(dplyr_row_3[1,2] > dplyr_row_3[2,2], "A", ifelse(dplyr_row_3[1,2] < dplyr_row_3[2,2], "B", 0))
dplyr_row_4 = data.frame(rows_4 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_4$v2 = ifelse(dplyr_row_4[1,2] > dplyr_row_4[2,2], "A", ifelse(dplyr_row_4[1,2] < dplyr_row_4[2,2], "B", 0))
dplyr_row_5 = data.frame(rows_5 %>% group_by(group_1) %>% summarize(sum = sum(v1)))
dplyr_row_5$v2 = ifelse(dplyr_row_5[1,2] > dplyr_row_5[2,2], "A", ifelse(dplyr_row_5[1,2] < dplyr_row_5[2,2], "B", 0))
# STEP 3: append "v2" to the 5 row blocks
# Each summary's v2 column carries a single label for its block; assigning it
# to the 10-row block puts that same label on every row of the block.
rows_1$v2 = dplyr_row_1$v2
rows_2$v2 = dplyr_row_2$v2
rows_3$v2 = dplyr_row_3$v2
rows_4$v2 = dplyr_row_4$v2
rows_5$v2 = dplyr_row_5$v2
# STEP 4: create final file by stacking the 5 labelled blocks in their original order
final_file = rbind(rows_1,rows_2, rows_3, rows_4, rows_5)
As a result, the final file looks something like this:
idd group_1 v1 group_2 v2
1 1 B 15.72967370 1 B
2 2 B -4.33377705 2 B
3 3 A -0.55185019 3 B
4 4 B 2.66888123 4 B
5 5 B 12.10907264 5 B
6 6 A 0.01079273 6 B
7 7 A 20.77850323 7 B
8 8 A -1.98974383 8 B
9 9 B 12.16637035 9 B
10 10 A 11.43087030 10 B
11 11 A -0.65750091 1 A
My Question: Can someone please show me how to perform Steps 1 to Step 4 in a single "dplyr" command?
Thanks!
Here is an alternative method.
library(tidyverse)

# Label every consecutive block of 10 rows with v2: "A" when the block's total
# of v1 for group_1 == "A" exceeds the total for group_1 == "B", "B" when it is
# smaller, and "0" when the two totals are equal.
df %>%
  # Integer division numbers the blocks 1, 2, 3, ... for any row count; a
  # trailing partial block simply becomes its own shorter group instead of
  # making rep() return a vector of the wrong length.
  mutate(group_index = (row_number() - 1) %/% 10 + 1) %>%
  group_by(group_index) %>%
  mutate(
    # Compute each total once per block and reuse it in both comparisons.
    sum_a = sum(v1[group_1 == "A"]),
    sum_b = sum(v1[group_1 == "B"]),
    v2 = case_when(
      sum_a > sum_b ~ "A",
      sum_a < sum_b ~ "B",
      TRUE ~ "0"
    )
  ) %>%
  # Drop the grouping and the helper columns so only the original columns
  # plus v2 remain.
  ungroup() %>%
  select(-c(group_index, sum_a, sum_b))
First I'll create a group_index to group every 10 rows together.
Then group_by the relevant columns and calculate sum.
Remove the grouping layer of group_1, since we need to compare the values in A and B.
If sum has only one unique value within a block of 10 rows, the totals are the same, so "0" is entered in column v2. If they are not the same, v2 is the group_1 category with the largest total.
Finally remove the sum column and sort by idd.
This method is able to solve problem with more than two groups in group_1.
The first 20 rows are shown here for example.
library(tidyverse)

# For every consecutive block of 10 rows, set v2 to the group_1 level with the
# largest total of v1, or to "0" when all levels in the block have the same
# total.
df %>%
  # Number the blocks with integer division so the row count does not have to
  # be an exact multiple of 10.
  mutate(group_index = (row_number() - 1) %/% 10 + 1) %>%
  # Total of v1 for each group_1 level inside each block.
  group_by(group_index, group_1) %>%
  mutate(sum = sum(v1)) %>%
  # Regroup by block only, so the totals of the different levels can be
  # compared with each other.
  group_by(group_index) %>%
  # A single distinct total means a tie. The tie value is the string "0" so
  # that v2 has the same (character) type in every block; a numeric 0 in one
  # block and a string in another cannot be combined into one column.
  mutate(
    v2 = if (length(unique(sum)) == 1) "0" else group_1[which.max(sum)]
  ) %>%
  ungroup() %>%
  select(-c(sum, group_index))
# A tibble: 20 x 5
idd group_1 v1 group_2 v2
<int> <chr> <dbl> <int> <chr>
1 1 B 15.7 1 B
2 2 B -4.33 2 B
3 3 A -0.552 3 B
4 4 B 2.67 4 B
5 5 B 12.1 5 B
6 6 A 0.0108 6 B
7 7 A 20.8 7 B
8 8 A -1.99 8 B
9 9 B 12.2 9 B
10 10 A 11.4 10 B
11 11 A -0.658 1 A
12 12 B 5.71 2 A
13 13 B 3.44 3 A
14 14 B 19.6 4 A
15 15 A 25.6 5 A
16 16 A -0.408 6 A
17 17 A 19.3 7 A
18 18 A 9.25 8 A
19 19 B -9.67 9 A
20 20 B 2.44 10 A
Related
Below I have a DF.
A B C D
a 4 2 2
g 5 2 2
d 7 65 7
e 3 6 7
I would like to make this DF so that column A has "g" in the first row, and "d" in the second row. I would like to do this by calling the value in column A (rather than an index). How can I do this?
Ideal output
A B C D
g 5 2 2
d 7 65 7
a 4 2 2
e 3 6 7
We may convert to a factor with the levels specified in the desired order before arranging
library(forcats)
library(dplyr)
DF %>%
arrange(fct_relevel(A, 'g', 'd'))
A B C D
1 g 5 2 2
2 d 7 65 7
3 a 4 2 2
4 e 3 6 7
with fct_relevel, we can specify the order of specific levels without specifying the rest of the levels
> with(DF, fct_relevel(A, 'g', 'd'))
[1] a g d e
Levels: g d a e
data
DF <- structure(list(A = c("a", "g", "d", "e"), B = c(4L, 5L, 7L, 3L
), C = c(2L, 2L, 65L, 6L), D = c(2L, 2L, 7L, 7L)), class = "data.frame",
row.names = c(NA,
-4L))
Another possible solution:
library(dplyr)

df <- data.frame(
  stringsAsFactors = FALSE,
  A = c("a", "g", "d", "e"),
  B = c(4L, 5L, 7L, 3L),
  C = c(2L, 2L, 65L, 6L),
  D = c(2L, 2L, 7L, 7L)
)

# Sort by each row's position in the target order: "g", "d", then every other
# value of A in its original order. setdiff(A, c("g", "d")) supplies those
# remaining values, so every row gets an explicit rank instead of relying on
# unmatched rows (NA) being sorted last.
df %>% arrange(match(A, c("g", "d", setdiff(A, c("g", "d")))))
#> A B C D
#> 1 g 5 2 2
#> 2 d 7 65 7
#> 3 a 4 2 2
#> 4 e 3 6 7
Try the code below
# Reorder the rows of df by value: "g" and "d" come first, followed by every
# other value of A in its original order. match() turns that list of values
# into row positions. local() keeps the helper names out of the workspace.
local({
  lead_values <- c("g", "d")
  other_values <- df$A[!(df$A %in% lead_values)]
  df[match(c(lead_values, other_values), df$A), ]
})
and you will see
A B C D
2 g 5 2 2
3 d 7 65 7
1 a 4 2 2
4 e 3 6 7
Just to add a base R solution if you are not interested in external packages, you can specify the row order directly:
# Sample Data
DF <- structure(list(A = c("a", "g", "d", "e"), B = c(4L, 5L, 7L, 3L
), C = c(2L, 2L, 65L, 6L), D = c(2L, 2L, 7L, 7L)), class = "data.frame",
row.names = c(NA, -4L))
A hard code for this example:
DF2 <- DF[c(2,3,1,4),]
A more generalizable example:
# Values of A that should come first, in the order they should appear
wanted <- c("g", "d")
# Position of each row's value in `wanted` (NA for rows that were not requested)
wanted_rank <- match(DF$A, wanted)
# Specify desired rows: sorted by their position in `wanted`, so the result
# follows the requested order rather than the order the rows happen to have in
# DF (na.last = NA drops the rows that were not requested)
rownums <- order(wanted_rank, na.last = NA)
# Specify other rows: everything that was not requested, in original order
otherrows <- which(is.na(wanted_rank))
# Organize
DF2 <- DF[c(rownums, otherrows), ]
I have a complete data frame of all cities in Brazil, but I only want some predefined cities, which are listed in one of its columns. I'd like to keep all the columns of my data frame, but select only the rows where the city in the column with all cities matches one of the cities in the column with predefined cities.
data = read.csv(file="C:/Users/guilherme/Desktop/data.csv", header=TRUE, sep=";")
data
> AllCities Year1990 Year200 PredefinedCities CharacCities1 CharacCities2
1 A 2 4 C 12 5
2 B 2 2 A 11 10
3 C 3 4 F 09 2
4 D 4 2
5 E 5 6
6 F 6 2
I want the following
> data
AllCities Year1990 Year200 PredefinedCities CharacCities1 CharacCities2
1 C 3 4 C 12 5
2 A 2 4 A 11 10
3 F 6 2 F 09 2
You need merge -
merge(
data[, c("AllCities", "Year1990", "Year200")],
data[, c("PredefinedCities", "CharacCities1", "CharacCities2")],
by.x = "AllCities", by.y = "PredefinedCities"
)
AllCities Year1990 Year200 CharacCities1 CharacCities2
1 A 2 4 11 10
2 C 3 4 12 5
3 F 6 2 9 2
Note - Your data format is unusual. If you can, you should fix the data source so that it gives you the AllCities and PredefinedCities tables separately, or maybe even join them correctly before creating the csv file.
Data -
structure(list(AllCities = c("A", "B", "C", "D", "E", "F"), Year1990 = c(2L,
2L, 3L, 4L, 5L, 6L), Year200 = c(4L, 2L, 4L, 2L, 6L, 2L), PredefinedCities = c("C",
"A", "F", "", "", ""), CharacCities1 = c(12L, 11L, 9L, NA, NA,
NA), CharacCities2 = c(5L, 10L, 2L, NA, NA, NA)), .Names = c("AllCities",
"Year1990", "Year200", "PredefinedCities", "CharacCities1", "CharacCities2"
), class = "data.frame", row.names = c(NA, -6L))
data <- data[data$AllCities %in% data$PredefinedCities,]
I have a data set with 3 variables: A, B and C. I need to group by A and get the minimum value of B among the rows where C is non-zero.
> data
A B C
1 a 3 0
2 a 6 1
3 a 9 2
4 a 12 2
5 b 3 0
6 b 6 0
7 b 9 0
8 b 12 4
Expected Output:
> output
1 2
1 a 6
2 b 12
I tried doing this which was running for more than 2 hours:
rbind(by(data, data$A, function(x) min(x$B[x$C>0])))
We group by 'A', get the min of 'B' where 'C' is not 0
library(dplyr)
df1 %>%
group_by(A) %>%
summarise(B = min(B[C > 0]))
# A tibble: 2 x 2
# A B
# <chr> <int>
#1 a 6
#2 b 12
Or a faster option would be to filter first, then do the group_by
df1 %>%
filter(C > 0) %>%
group_by(A) %>%
summarise(B = min(B))
Or with data.table
library(data.table)
setDT(df1)[,.(B = min(B[C > 0])) , A]
data
df1 <- structure(list(A = c("a", "a", "a", "a", "b", "b", "b", "b"),
B = c(3L, 6L, 9L, 12L, 3L, 6L, 9L, 12L), C = c(0L, 1L, 2L,
2L, 0L, 0L, 0L, 4L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8"))
The Result field should contain (Sum of Mod values for a STATE where category not equal to 0). Remaining values should be 0.
I am trying to achieve the same using for loop but not getting the desired result. Appreciate your help in getting this resolved.
Here is some SampleData
State Category Mod Result
1 A 0 5 45
2 A 1 10 0
3 A 2 15 0
4 A 3 10 0
5 A 4 10 0
6 B 0 11 60
7 B 1 12 0
8 B 2 14 0
9 B 3 16 0
10 B 4 18 0
This is what I am trying via for loop:
for(i in SampleData) {
SampleData$Result[i] <- ifelse(SampleData$Category[i] == "0",
sum(SampleData$Mod[i:i+5]),
0)
}
You could try
# Result = per-State total of Mod over the rows whose Category is not 0, placed
# on the Category == 0 rows; every other row gets 0.
# Multiplying by (Category != 0) zeroes out the Category == 0 rows before the
# grouped sum, i.e. it is the State total minus the Mod of the Category == 0
# rows. Doing the masking on the full columns (outside FUN) keeps the lengths
# aligned; inside FUN, x is only one State's slice while Category would still
# be the whole column.
transform(df, Result = ifelse(Category == 0,
                              ave(Mod * (Category != 0), State, FUN = sum),
                              0))
# State Category Mod Result
#1 A 0 5 45
#2 A 1 10 0
#3 A 2 15 0
#4 A 3 10 0
#5 A 4 10 0
#6 B 0 11 60
#7 B 1 12 0
#8 B 2 14 0
#9 B 3 16 0
#10 B 4 18 0
ave(Mod, State, FUN = function(x) sum(x) - x[Category == 0]) calculates the sum of Mod for each value of State and subtracts the respective value of Mod where Category == 0 is TRUE.
data
df <- structure(list(State = c("A", "A", "A", "A", "A", "B", "B", "B",
"B", "B"), Category = c(0L, 1L, 2L, 3L, 4L, 0L, 1L, 2L, 3L, 4L
), Mod = c(5L, 10L, 15L, 10L, 10L, 11L, 12L, 14L, 16L, 18L)), .Names = c("State",
"Category", "Mod"), row.names = c("1", "2", "3", "4", "5", "6",
"7", "8", "9", "10"), class = "data.frame")
A dplyr version, if you like:
library(dplyr)

# Sample data; tibble() replaces the deprecated data_frame() constructor.
SampleData <- tibble(
  State = c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B"),
  Category = c(0L, 1L, 2L, 3L, 4L, 0L, 1L, 2L, 3L, 4L),
  Mod = c(5L, 10L, 15L, 10L, 10L, 11L, 12L, 14L, 16L, 18L)
)

# Within each State, put the sum of Mod over the non-zero categories on the
# Category == 0 row and 0 everywhere else. Selecting by Category != 0 (rather
# than dropping the first element) does not depend on the row order.
SampleData %>%
  group_by(State) %>%
  mutate(Result = ifelse(Category == 0, sum(Mod[Category != 0]), 0))
#> # A tibble: 10 x 4
#> # Groups: State [2]
#> State Category Mod Result
#> <chr> <int> <int> <dbl>
#> 1 A 0 5 45
#> 2 A 1 10 0
#> 3 A 2 15 0
#> 4 A 3 10 0
#> 5 A 4 10 0
#> 6 B 0 11 60
#> 7 B 1 12 0
#> 8 B 2 14 0
#> 9 B 3 16 0
#> 10 B 4 18 0
I have 2 different data.tables. I need to merge them and, for each value of X, get the maximum value in each column. Examples of the two tables are given as Input below, followed by the expected output.
Input
Table 1
X A B
A 3
B 4 6
C 5
D 9 12
Table 2
X A B
A 1 5
B 6 8
C 7 14
D 5
E 1 1
F 2 3
G 5 6
Expected Output:
X A B
A 3 5
B 6 8
C 7 14
D 9 12
E 1 1
F 2 3
G 5 6
We can rbind the two datasets and do a group by max
library(data.table)
rbindlist(list(tbl1, tbl2))[, lapply(.SD, max, na.rm = TRUE), X]
# X A B
#1: A 3 5
#2: B 6 8
#3: C 7 14
#4: D 9 12
#5: E 1 1
#6: F 2 3
#7: G 5 6
If we are using base R, then use aggregate after rbinding the datasets
aggregate(.~ X, rbind(tbl1, tbl2), max, na.rm = TRUE, na.action = NULL)
NOTE: Assume that the 'A', 'B' columns are numeric and blanks are NA
data
tbl1 <- structure(list(X = c("A", "B", "C", "D"), A = c(3L, 4L, 5L, 9L
), B = c(NA, 6L, NA, 12L)), .Names = c("X", "A", "B"), class = "data.frame",
row.names = c(NA, -4L))
tbl2 <- structure(list(X = c("A", "B", "C", "D", "E", "F", "G"), A = c(1L,
6L, 7L, 5L, 1L, 2L, 5L), B = c(5L, 8L, 14L, NA, 1L, 3L, 6L)), .Names = c("X",
"A", "B"), class = "data.frame",
row.names = c(NA, -7L))