Delete rows if column values are equal - r

I want to delete the rows if their values in the columns (YEAR, POL, CTY, ID, AMOUNT) are equal to those of other rows. Please see the output table below.
Table:
YEAR POL CTY ID AMOUNT RAN LEGAL
2017 30408 11 36 3500 RANGE1 L0015N20W23
2017 30408 11 36 3500 RANGE1 L00210N20W24
2017 30408 11 36 3500 RANGE1 L00310N20W25
2017 30409 11 36 3500 RANGE1 L0015N20W23
2017 30409 11 35 3500 RANGE2 NANANA
2017 30409 11 35 3500 RANGE3 NANANA
2017 30409 11 35 3500 RANGE3 NANANA
Output:
YEAR POL CTY ID AMOUNT RAN LEGAL
2017 30408 11 35 3500 RANGE1 L0015N20W23

You can try this:
# Columns that together form the key used to detect repeated rows.
no_duplicate_cols <- c("YEAR", "POL", "CTY", "ID", "AMOUNT")
# duplicated() on its own only flags the 2nd, 3rd, ... occurrence of a key, so
# the first row of every repeated group would survive. Scanning from both ends
# flags every member of a repeated group, leaving only keys that occur once.
key_cols <- df[, no_duplicate_cols, drop = FALSE]
is_repeated <- duplicated(key_cols) | duplicated(key_cols, fromLast = TRUE)
new_df <- df[!is_repeated, ]
The data frame new_df will hold the rows from df that are not duplicated.

If I understood the question correctly then I think you can try this
library(dplyr)
# Group on the key columns, then keep only the groups that contain exactly one
# row, i.e. drop every row whose key occurs more than once.
filter(
  group_by(df, YEAR, POL, CTY, ID, AMOUNT),
  n() == 1
)
Output (but it seems that the output provided in the original question has a bit of a typo!):
# A tibble: 1 x 7
# Groups: YEAR, POL, CTY, ID, AMOUNT [1]
YEAR POL CTY ID AMOUNT RAN LEGAL
1 2017 30409 11 36 3500 RANGE1 L0015N20W23
#sample data
> dput(df)
structure(list(YEAR = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L), POL = c(30408L, 30408L, 30408L, 30409L, 30409L, 30409L,
30409L), CTY = c(11L, 11L, 11L, 11L, 11L, 11L, 11L), ID = c(36L,
36L, 36L, 36L, 35L, 35L, 35L), AMOUNT = c(3500L, 3500L, 3500L,
3500L, 3500L, 3500L, 3500L), RAN = structure(c(1L, 1L, 1L, 1L,
2L, 3L, 3L), .Label = c("RANGE1", "RANGE2", "RANGE3"), class = "factor"),
LEGAL = structure(c(1L, 2L, 3L, 1L, 4L, 4L, 4L), .Label = c("L0015N20W23",
"L00210N20W24", "L00310N20W25", "NANANA"), class = "factor")), .Names = c("YEAR",
"POL", "CTY", "ID", "AMOUNT", "RAN", "LEGAL"), class = "data.frame", row.names = c(NA,
-7L))

Related

subtracting column from the sum by group in R

Here part of mydataset
df=structure(list(CustomerName = structure(c(1L, 1L, 1L, 2L, 2L,
2L), .Label = c("x", "y"), class = "factor"), ItemRelation = c(11202L,
11202L, 11202L, 1L, 1L, 1L), SaleCount = c(214L, 88L, 42L, 214L,
88L, 42L), DocumentNum = c(137L, 137L, 137L, 3L, 3L, 3L), DocumentYear = c(2018L,
2018L, 2018L, 2018L, 2018L, 2018L), k = c(114.66667, 114.66667,
114.66667, 114.66667, 114.66667, 114.66667), m0 = c(31.92, 31.92,
31.92, 31.92, 31.92, 31.92), Action_Effect = c(82.74667, 82.74667,
82.74667, 82.74667, 82.74667, 82.74667)), .Names = c("CustomerName",
"ItemRelation", "SaleCount", "DocumentNum", "DocumentYear", "k",
"m0", "Action_Effect"), class = "data.frame", row.names = c(NA,
-6L))
I need, for each group CustomerName+ItemRelation+DocumentNum+DocumentYear,
to calculate the sum of SaleCount and then subtract the Action_Effect column from this sum.
I.E. output must be
df2=structure(list(CustomerName = structure(c(1L, 1L, 1L, 2L, 2L,
2L), .Label = c("x", "y"), class = "factor"), ItemRelation = c(11202L,
11202L, 11202L, 1L, 1L, 1L), SaleCount = c(214L, 88L, 42L, 214L,
88L, 42L), DocumentNum = c(137L, 137L, 137L, 3L, 3L, 3L), DocumentYear = c(2018L,
2018L, 2018L, 2018L, 2018L, 2018L), X. = c(114.66667, 114.66667,
114.66667, 114.66667, 114.66667, 114.66667), m0 = c(31.92, 31.92,
31.92, 31.92, 31.92, 31.92), Action_Effect = c(82.74667, 82.74667,
82.74667, 82.74667, 82.74667, 82.74667), sum = c(344L, 344L,
344L, 344L, 344L, 344L), output = c(261.25333, 261.25333, 261.25333,
261.25333, 261.25333, 261.25333)), .Names = c("CustomerName",
"ItemRelation", "SaleCount", "DocumentNum", "DocumentYear", "X.",
"m0", "Action_Effect", "sum", "output"), class = "data.frame", row.names = c(NA,
-6L))
The table is long, so I decided to show the desired output via dput().
How can i do it?
Your data is a bit weird, as the values are the same for both groups, but this should work:
library(dplyr)
# For every CustomerName / ItemRelation / DocumentNum / DocumentYear group,
# total the SaleCount values and subtract each row's Action_Effect from it.
df %>%
  group_by(CustomerName, ItemRelation, DocumentNum, DocumentYear) %>%
  mutate(test = sum(SaleCount) - Action_Effect)
# A tibble: 6 x 9
# Groups: CustomerName, ItemRelation, DocumentNum, DocumentYear [2]
CustomerName ItemRelation SaleCount DocumentNum DocumentYear k m0 Action_Effect test
<fctr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 x 11202 214 137 2018 114.6667 31.92 82.74667 261.2533
2 x 11202 88 137 2018 114.6667 31.92 82.74667 261.2533
3 x 11202 42 137 2018 114.6667 31.92 82.74667 261.2533
4 y 1 214 3 2018 114.6667 31.92 82.74667 261.2533
5 y 1 88 3 2018 114.6667 31.92 82.74667 261.2533
6 y 1 42 3 2018 114.6667 31.92 82.74667 261.2533
To add the sum, use
# Same grouping as before, but keep the per-group total as its own column and
# derive the final value from that freshly created column.
df %>%
  group_by(CustomerName, ItemRelation, DocumentNum, DocumentYear) %>%
  mutate(
    sum = sum(SaleCount),
    output = sum - Action_Effect
  )
For completeness, adding base and data.table syntax:
base:
# ave() returns the per-group sum aligned with the original row order, so the
# result stays correct even when the rows of a group are not contiguous or the
# groups are not sorted (unlist(by(...)) yields values in sorted-group order
# and would then be assigned to the wrong rows).
df$test <- ave(df$SaleCount,
               paste(df$CustomerName, df$ItemRelation, df$DocumentNum, df$DocumentYear),
               FUN = sum) - df$Action_Effect
df
data.table:
library(data.table)
# Convert df to a data.table by reference, then add test2 in place: the
# per-group SaleCount total minus each row's Action_Effect. The trailing []
# prints the updated table.
setDT(df)
df[, test2 := sum(SaleCount) - Action_Effect,
   by = c("CustomerName", "ItemRelation", "DocumentNum", "DocumentYear")][]

Calculate date diff using same date field column

I want to find the total sum of running minutes of a battery per month and year. For this I have the following condition:
If Battery.voltage < 50 then "Yes", otherwise "No".
Note: For calculating the total sum of minutes, we can use the time stamp column, which contains day, month, year, hour and minutes.
This is my data:
# Time.stamp Battery.voltage Condition
# 1 01/04/2016 00:00 51 No
# 2 01/04/2016 00:01 52 No
# 3 01/04/2016 00:02 45 Yes
# 4 01/04/2016 00:03 48 Yes
# 5 01/04/2016 00:04 49 Yes
# 6 01/04/2016 00:05 55 No
# 7 01/04/2016 00:06 54 No
# ...
structure(list(
Time.stamp = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 10L, 11L, 12L, 12L, 13L),
.Label = c("01/04/2016 00:00", "01/04/2016 00:01", "01/04/2016 00:02", "01/04/2016 00:03",
"01/04/2016 00:04", "01/04/2016 00:05", "01/04/2016 00:06", "01/04/2016 00:07",
"01/04/2016 00:08", "01/04/2016 00:09", "01/04/2016 00:11", "01/04/2016 00:12",
"01/04/2016 00:13"), class = "factor"),
Battery.voltage = c(51L, 52L, 45L, 48L, 49L, 55L, 54L, 52L, 51L, 49L, 48L, 47L, 45L, 50L, 51L),
Condition = structure(c(1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L),
.Label = c("No", "Yes"), class = "factor")),
.Names = c("Time.stamp", "Battery.voltage", "Condition"),
class = "data.frame", row.names = c(NA, -15L))
My expected output is something like this:
Month year Sum of mins running in battery
Jan 2016 350min
Feb 2016 450min
etc.
Unfortunately, your sample data is not very representative of your problem statement, as it only includes data for one day. It would have been beneficial to provide some code that generates random data for sufficient entries (i.e. dates).
That aside, you could adapt the following solution (here I assume your timestamp format is "DD/MM/YYYY"):
df %>%
  mutate(
    # Parse the "DD/MM/YYYY HH:MM" strings into date-times.
    Time.stamp = as.POSIXct(Time.stamp, format = "%d/%m/%Y %H:%M"),
    # Grouping keys at day, month and year resolution.
    byday = format(Time.stamp, "%d/%m/%Y"),
    # Month key is month + year; "%d/%m" would still split the data per day
    # and would merge the same calendar day of different years.
    bymonth = format(Time.stamp, "%m/%Y"),
    byyear = format(Time.stamp, "%Y")) %>%
  group_by(byday) %>%
  # Each row flagged "Yes" counts as one running minute.
  summarise(sum.running.in.mins = sum(Condition == "Yes"))
## A tibble: 1 x 2
# byday sum.running.in.mins
# <chr> <int>
#1 01/04/2016 7
Here we create columns byday, bymonth and byyear according to which you can group entries and calculate the sum of total running time per group. In above example, I calculate the total running time by day; to get the total running time per month, you would replace group_by(byday) with group_by(bymonth).

Find out the item first time shows in a data set

I have a data set ProductTable, and I want to return the dates on which each ProductsFamily was ordered for the first time and for the very last time. Example:
ProductTable
OrderPostingYear OrderPostingMonth OrderPostingDate ProductsFamily Sales QTY
2008 1 20 R1 5234 1
2008 1 12 R2 223 2
2009 1 30 R3 34 1
2008 2 1 R1 1634 3
2010 4 23 R3 224 1
2009 3 20 R1 5234 1
2010 7 12 R2 223 2
The result should be as follows:
OrderTime
ProductsFamily OrderStart OrderEnd SumSales
R1 2008/1/20 2009/3/20 12102
R2 2008/1/12 2010/7/12 446
R3 2009/1/30 2010/4/23 258
I have no idea how to do it. Any suggestions?
ProductTable <- structure(list(OrderPostingYear = c(2008L, 2008L, 2009L, 2008L,
2010L, 2009L, 2010L), OrderPostingMonth = c(1L, 1L, 1L, 2L, 4L,
3L, 7L), OrderPostingDate = c(20L, 12L, 30L, 1L, 23L, 20L, 12L
), ProductsFamily = structure(c(1L, 2L, 3L, 1L, 3L, 1L, 2L), .Label = c("R1",
"R2", "R3"), class = "factor"), Sales = c(5234L, 223L, 34L, 1634L,
224L, 5234L, 223L), QTY = c(1L, 2L, 1L, 3L, 1L, 1L, 2L)), .Names = c("OrderPostingYear",
"OrderPostingMonth", "OrderPostingDate", "ProductsFamily", "Sales",
"QTY"), class = "data.frame", row.names = c(NA, -7L))
We can also use dplyr/tidyr to do this. We arrange the columns, concatenate the 'Year:Date' columns with unite, group by 'ProductsFamily', get the first, last of 'Date' column and sum of 'Sales' within summarise.
library(dplyr)
library(tidyr)
# Sort chronologically within each product family, glue year/month/day into a
# single "Y/M/D" string, then take the first and last string per family
# together with the total sales.
ProductTable %>%
  arrange(ProductsFamily, OrderPostingYear, OrderPostingMonth, OrderPostingDate) %>%
  unite(Date, OrderPostingYear, OrderPostingMonth, OrderPostingDate, sep = "/") %>%
  group_by(ProductsFamily) %>%
  summarise(
    OrderStart = Date[1],
    OrderEnd = Date[n()],
    SumSales = sum(Sales)
  )
# Source: local data frame [3 x 4]
# ProductsFamily OrderStart OrderEnd SumSales
# (fctr) (chr) (chr) (int)
# 1 R1 2008/1/20 2009/3/20 12102
# 2 R2 2008/1/12 2010/7/12 446
# 3 R3 2009/1/30 2010/4/23 258
You can first set up the date in a new column, and then aggregate your data using data.table package (you take the first and last date by ID, as well as the sum of sales):
library(data.table)
# Step 1: assemble a proper Date from the year / month / day columns.
ProductTable$date <- as.Date(
  paste(ProductTable$OrderPostingYear,
        ProductTable$OrderPostingMonth,
        ProductTable$OrderPostingDate,
        sep = "."),
  format = "%Y.%m.%d"
)
# Step 2: per product family, take the earliest and latest date and the
# total sales.
setDT(ProductTable)[, .(OrderStart = sort(date)[1],
                        OrderEnd = sort(date)[.N],
                        SumSales = sum(Sales)),
                    by = ProductsFamily]
# ProductsFamily OrderStart OrderEnd SumSales
#1: R1 2008-01-20 2009-03-20 12102
#2: R2 2008-01-12 2010-07-12 446
#3: R3 2009-01-30 2010-04-23 258

geom_area group and fill by different variables

I have a geom_area plot that looks like this:
The x-axis is a time series, and I want to color the fill of each facet by groups of the variable "estacion" (seasons of the year). Here's a sample of my data:
año censo estacion tipoEuro censEu censTot pCensEu
2010 2010-01-01 Invierno HA frisona 13 32 40.62500
2010 2010-01-01 Invierno Bovinos jovenes 10 32 31.25000
2010 2010-01-02 Invierno HA frisona 13 32 40.62500
---
2014 2014-12-30 Invierno Bovinos jovenes 15 26 57.69231
2014 2014-12-31 Invierno HA frisona 3 26 11.53846
2014 2014-12-31 Invierno Terneros 8 26 30.76923
Here's the code I'm using to make the plot:
ggplot(censTot1,aes(x=censo, y=pCensEu,group=tipoEuro)) +
geom_area() +
geom_line()+ facet_grid(tipoEuro ~ .)
and this is the code i intend to use, and the error generated:
ggplot(censTot1,aes(x=censo,y=pCensEu,group=tipoEuro,fill=estacion)) +
geom_area() +
geom_line()+ facet_grid(tipoEuro ~ .)
Error: Aesthetics can not vary with a ribbon
I'm not sure about the desired output. Would this solve your problem?
library(ggplot2)
# x, y and group are declared at the plot level and shared by both layers;
# fill is mapped only inside geom_area(), so geom_line() does not receive it.
ggplot(df, aes(x=censo, y=pCensEu,group=tipoEuro))+
geom_area(aes(fill=estacion))+
geom_line()+
# One facet row per tipoEuro value.
facet_grid(tipoEuro ~ .)
The data used
df <- structure(list(año = c(2010L, 2010L, 2010L, 2014L, 2014L, 2014L
), censo = structure(c(1L, 1L, 2L, 3L, 4L, 4L), .Label = c("01/01/2010",
"02/01/2010", "30/12/2014", "31/12/2014"), class = "factor"),
estacion = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "Invierno", class = "factor"),
tipoEuro = structure(c(2L, 1L, 2L, 1L, 2L, 3L), .Label = c("Bovinos jovenes",
"HA frisona", "Terneros"), class = "factor"), censEu = c(13L,
10L, 13L, 15L, 3L, 8L), censTot = c(32L, 32L, 32L, 26L, 26L,
26L), pCensEu = c(40.625, 31.25, 40.625, 57.69231, 11.53846,
30.76923)), .Names = c("año", "censo", "estacion", "tipoEuro",
"censEu", "censTot", "pCensEu"), class = "data.frame", row.names = c(NA,
-6L))
You can also try this:
# Variant: colour (not fill) is mapped to estacion at the plot level, and the
# grouping is the tipoEuro x estacion interaction instead of tipoEuro alone.
ggplot(df, aes(x=censo, y=pCensEu, color=estacion, group=interaction(tipoEuro,estacion))) +
geom_area() +
geom_line()+ facet_grid(tipoEuro ~ .)

controlling text when using add_tooltip in ggvis - r

I am trying to get more control over the text that appears when using add_tooltip in ggvis.
Say I want to plot 'totalinns' against 'avg' for this dataframe. Color points by 'country'.
The text I want to appear in the hovering tooltip would be: 'player', 'country', 'debutyear' 'avg'
tmp:
# player totalruns totalinns totalno totalout avg debutyear country
# 1 AG Ganteaume 112 1 0 1 112.00000 1948 WI
# 2 DG Bradman 6996 80 10 70 99.94286 1928 Aus
# 3 MN Nawaz 99 2 1 1 99.00000 2002 SL
# 4 VH Stollmeyer 96 1 0 1 96.00000 1939 WI
# 5 DM Lewis 259 5 2 3 86.33333 1971 WI
# 6 Abul Hasan 165 5 3 2 82.50000 2012 Ban
# 7 RE Redmond 163 2 0 2 81.50000 1973 NZ
# 8 BA Richards 508 7 0 7 72.57143 1970 SA
# 9 H Wood 204 4 1 3 68.00000 1888 Eng
# 10 JC Buttler 200 3 0 3 66.66667 2014 Eng
I understand that I need to make a key/id variable as ggvis only takes information supplied to it. Therefore I need to refer back to the original data. I have tried changing my text inside of my paste0() command, but still can't get it right.
tmp$id <- 1:nrow(tmp)
all_values <- function(x) {
if(is.null(x)) return(NULL)
row <- tmp[tmp$id == x$id, ]
paste0(tmp$player, tmp$country, tmp$debutyear,
tmp$avg, format(row), collapse = "<br />")
}
tmp %>% ggvis(x = ~totalinns, y = ~avg, key := ~id) %>%
layer_points(fill = ~factor(country)) %>%
add_tooltip(all_values, "hover")
Find below code to reproduce example:
tmp <- structure(list(player = c("AG Ganteaume", "DG Bradman", "MN Nawaz",
"VH Stollmeyer", "DM Lewis", "Abul Hasan", "RE Redmond", "BA Richards",
"H Wood", "JC Buttler"), totalruns = c(112L, 6996L, 99L, 96L,
259L, 165L, 163L, 508L, 204L, 200L), totalinns = c(1L, 80L, 2L,
1L, 5L, 5L, 2L, 7L, 4L, 3L), totalno = c(0L, 10L, 1L, 0L, 2L,
3L, 0L, 0L, 1L, 0L), totalout = c(1L, 70L, 1L, 1L, 3L, 2L, 2L,
7L, 3L, 3L), avg = c(112, 99.9428571428571, 99, 96, 86.3333333333333,
82.5, 81.5, 72.5714285714286, 68, 66.6666666666667), debutyear = c(1948L,
1928L, 2002L, 1939L, 1971L, 2012L, 1973L, 1970L, 1888L, 2014L
), country = c("WI", "Aus", "SL", "WI", "WI", "Ban", "NZ", "SA",
"Eng", "Eng")), .Names = c("player", "totalruns", "totalinns",
"totalno", "totalout", "avg", "debutyear", "country"), class = c("tbl_df",
"data.frame"), row.names = c(NA, -10L))
I think this is closer:
# Build the hover tooltip text for the point ggvis passes in as `x`.
#
# x: NULL when nothing is hovered, otherwise a list carrying the `id` key of
#    the hovered point.
# Returns NULL for a NULL `x`; otherwise the player, country, debut year and
# average of the row of `tmp` whose id equals x$id, joined with "<br>".
all_values <- function(x) {
  if (is.null(x)) return(NULL)
  # Look the row up by its id value rather than by position, so the tooltip
  # stays correct even if tmp has been reordered or filtered after the id
  # column was created.
  row <- tmp[tmp$id == x$id, ]
  paste(row$player, row$country, row$debutyear, row$avg, sep = "<br>")
}

Resources