data.table join by subset NAs - r

This is a query that comes from an earlier thread I chanced upon, two tables DT1 and DT2
DT1
Country State City Start End
1 IN Telangana Hyderabad 100 200
2 IN Maharashtra Pune 300 400
3 IN Haryana Gurgaon 500 600
4 IN Maharashtra Pune 700 800
5 IN Gujarat Ahmedabad 900 1000
DT2 with 7 rows
ID No
1 157
2 346
3 389
4 453
5 562
6 9874
7 98745
When they are joined using this code,
DT2[DT1, on=.(No>Start,No<End), ]
produces this output, with 6 rows
ID No No.1 Country State City
1: 1 100 200 IN Telangana Hyderabad
2: 2 300 400 IN Maharashtra Pune
3: 3 300 400 IN Maharashtra Pune
4: 5 500 600 IN Haryana Gurgaon
5: NA 700 800 IN Maharashtra Pune
6: NA 900 1000 IN Gujarat Ahmedabad
I can understand the NAs corresponding to IDs 6 and 7 (row numbers 5 and 6), but why is the NA corresponding to ID 4 missing?
ID 4, whose No is 453, maps to none of the ranges in DT1, so shouldn't it have produced an NA as well?
EDIT1: Providing Code to create the datasets
DT1<-
structure(list(Country = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "IN", class = "factor"),
State = structure(c(4L, 3L, 2L, 3L, 1L), .Label = c("Gujarat",
"Haryana", "Maharashtra", "Telangana"), class = "factor"),
City = structure(c(3L, 4L, 2L, 4L, 1L), .Label = c("Ahmedabad",
"Gurgaon", "Hyderabad", "Pune"), class = "factor"), Start = c(100L,
300L, 500L, 700L, 900L), End = c(200L, 400L, 600L, 800L,
1000L)), .Names = c("Country", "State", "City", "Start",
"End"), class = c("data.table", "data.frame"))
DT2<-
structure(list(ID = 1:7, No = c(157L, 346L, 389L, 453L, 562L,
9874L, 98745L)), .Names = c("ID", "No"), class = c("data.table",
"data.frame"))

Related

Transforming multiple columns structure using Dplyr in R

I have a dataset, df,
State Year 0 1 2 3 4 5
Georgia 2001 10,000 200 300 400 500 800
Georgia 2002 20,000 500 500 1,000 2,000 2,500
Georgia 2003 2,000 5,000 1,000 400 300 8,000
Washington 2001 1,000 10,000 6,000 8,000 9,900 10,000
Washington 2006 5,000 300 200 900 1,000 8,000
I would like my desired output to look like this:
State Year Age Population
Georgia 2001 0 10,000
Georgia 2002 0 20,000
Georgia 2003 0 2,000
Georgia 2001 1 200
Georgia 2002 1 500
Georgia 2003 1 5000
Georgia 2001 2 300
Georgia 2002 2 500
Georgia 2003 2 1000
Georgia 2001 3 400
Georgia 2002 3 1000
Georgia 2003 3 400
Georgia 2001 4 500
Georgia 2002 4 2000
Georgia 2003 4 300
Georgia 2001 5 800
Georgia 2002 5 2500
Georgia 2003 5 8000
Washington 2001 0 1000
Washington 2006 0 5000
Washington 2001 1 10000
Washington 2006 1 300
Washington 2001 2 6000
Washington 2006 2 200
Washington 2001 3 8000
Washington 2006 3 900
Washington 2001 4 9900
Washington 2006 4 1000
Washington 2001 5 10000
Washington 2006 5 8200
Here is my dput
structure(list(state = structure(c(1L, 1L, 1L, 2L, 2L), .Label = c("georgia",
"washington"), class = "factor"), year = c(2001L, 2002L, 2003L,
2001L, 2006L), X0 = structure(c(1L, 3L, 4L, 2L, 5L), .Label = c("10,000",
"1000", "20,000", "2000", "5000"), class = "factor"), X1 = structure(c(2L,
4L, 5L, 1L, 3L), .Label = c("10,000", "200", "300", "500", "5000"
), class = "factor"), X2 = c(300L, 500L, 1000L, 6000L, 200L),
X3 = c(400L, 1000L, 400L, 8000L, 900L), X4 = c(500L, 2000L,
300L, 99000L, 1000L), X5 = structure(c(3L, 2L, 4L, 1L, 4L
), .Label = c("10,000", "2500", "800", "8000"), class = "factor")), class = "data.frame", row.names
=
c(NA,
-5L))
This is what I have tried:
I know that I must group by the state and the year, and perform some type of pivot, possibly using the gather() function:
library(tidyr)
library(dplyr)
df1 <- gather(df, 0, 1, 2, 3, 4, 5 factor_key=TRUE)
df %>% groupby(State, Year) %>%
mutate('Age', 'Population')
We can first convert the column type to numeric by extracting the numeric part and then do the reshape
library(dplyr)
library(tidyr)
# Parse the count columns (X0..X5, some stored as factors like "10,000") into
# numbers, then reshape to one row per state / year / age.
df %>%
  mutate(across(matches("\\d+$"), ~ readr::parse_number(as.character(.x)))) %>%
  pivot_longer(
    cols = -c(state, year),
    names_to = "Age",
    names_prefix = "X",                        # "X0" -> "0"
    names_transform = list(Age = as.integer),  # "0"  -> 0L
    values_to = "Population"
  ) %>%
  # Same row order as the desired output: state, then age, then year.
  arrange(state, Age, year)

Replace a value in a column based on column number

In the following script:
dataset <- read.csv("/home/adam/Desktop/Temp/lrtest.csv")
for(i in 3:ncol(dataset)){
uq <- unique(dataset[,i])
j <- i * 100
for(x in uq){
dataset[,i][dataset[,i] == x] <- j #dataset$nm[dataset$nm == x] <- j
j <- j + 1
}
}
I would like to go through each column and replace each of its string values with numbers. The problem is that replacing the values (line 6) results in NA; see the output below.
How can I solve it?
The data:
Class Branch LA_type Method_type Method_call Branch_type Branch_condition Tested_parameter
Goal 12 Smooth public static never called IFNE TRUE String
TreeApp 20 Rugged constructor none IF_ICMPGE FALSE int
Password 4 Smooth private never called IFEQ FALSE int
XMLParser 9 Rugged constructor none IFNONNULL TRUE String
MapClass 33 Smooth public never called IFGT FALSE double
The output:
Class Branch LA_type Method_type Method_call Branch_type Branch_condition Tested_parameter
1 Goal 12 <NA> <NA> <NA> <NA> 700 <NA>
2 TreeApp 20 <NA> <NA> <NA> <NA> 701 <NA>
3 Password 4 <NA> <NA> <NA> <NA> 701 <NA>
4 XMLParser 9 <NA> <NA> <NA> <NA> 700 <NA>
5 MapClass 33 <NA> <NA> <NA> <NA> 701 <NA>
We can use lapply to iterate over column 3 to end of the dataframe, convert the data to factor (which it already is probably) with unique levels and add increasing sequence of 100.
# For every column from the third onwards, number its distinct values in order
# of first appearance (0, 1, 2, ...) and offset them by 100 * column index.
df[3:ncol(df)] <- lapply(3:ncol(df), function(col_idx) {
  values <- df[[col_idx]]
  # Levels in order of first appearance, so the first distinct value gets code 1.
  codes <- as.integer(factor(values, levels = unique(values)))
  col_idx * 100 + codes - 1
})
df
# Class Branch LA_type Method_type Method_call Branch_type Branch_condition
#1 Goal 12 300 400 500 600 700
#2 TreeApp 20 301 401 501 601 701
#3 Password 4 300 402 500 602 701
#4 XMLParser 9 301 401 501 603 700
#5 MapClass 33 300 403 500 604 701
# Tested_parameter
#1 800
#2 801
#3 801
#4 800
#5 802
data
df <- structure(list(Class = structure(c(1L, 4L, 3L, 5L, 2L), .Label = c("Goal",
"MapClass", "Password", "TreeApp", "XMLParser"), class = "factor"),
Branch = c(12L, 20L, 4L, 9L, 33L), LA_type = structure(c(2L,
1L, 2L, 1L, 2L), .Label = c("Rugged", "Smooth"), class = "factor"),
Method_type = structure(c(4L, 1L, 2L, 1L, 3L), .Label = c("constructor",
"private", "public", "public_static"), class = "factor"),
Method_call = structure(c(1L, 2L, 1L, 2L, 1L), .Label = c("never_called",
"none"), class = "factor"), Branch_type = structure(c(4L,
1L, 2L, 5L, 3L), .Label = c("IF_ICMPGE", "IFEQ", "IFGT",
"IFNE", "IFNONNULL"), class = "factor"), Branch_condition = c(TRUE,
FALSE, FALSE, TRUE, FALSE), Tested_parameter = structure(c(3L,
2L, 2L, 3L, 1L), .Label = c("double", "int", "String"), class = "factor")),
class = "data.frame", row.names = c(NA, -5L))

Remove NAs by ID

I am dealing with a dataset like this
Id Value Date
1 250 NA
1 250 2010-06-21
2 6 NA
2 6 2012-08-23
3 545 NA
7 3310 NA
My goal is to remove entire rows if there is an NA in Date column and ID is duplicate. The final output should look like:
Id Value Date
1 250 2010-06-21
2 6 2012-08-23
3 545 NA
7 3310 NA
# Drop a row only when its Date is NA AND its Id occurs more than once.
# `&` binds tighter than `|` in R, so the two-direction duplicate check has to
# be parenthesised; without the parentheses every row that has a later
# duplicate Id is removed, even when its Date is present.
df1[!(is.na(df1$Date) &
  (duplicated(df1$Id) | duplicated(df1$Id, fromLast = TRUE))), ]
# Id Value Date
#2 1 250 2010-06-21
#4 2 6 2012-08-23
#5 3 545 <NA>
#6 7 3310 <NA>
DATA
df1 = structure(list(Id = c(1L, 1L, 2L, 2L, 3L, 7L), Value = c(250L,
250L, 6L, 6L, 545L, 3310L), Date = c(NA, "2010-06-21", NA, "2012-08-23",
NA, NA)), .Names = c("Id", "Value", "Date"), class = "data.frame", row.names = c(NA,
-6L))

conditional cumulative sum using dplyr

My dataframe looks like this and I want two separate cumulative columns, one for fund A and the other for fund B
Name Event SalesAmount Fund Cum-A(desired) Cum-B(desired)
John Webinar NA NA NA NA
John Sale 1000 A 1000 NA
John Sale 2000 B 1000 2000
John Sale 3000 A 4000 2000
John Email NA NA 4000 2000
Tom Webinar NA NA NA NA
Tom Sale 1000 A 1000 NA
Tom Sale 2000 B 1000 2000
Tom Sale 3000 A 4000 2000
Tom Email NA NA 4000 2000
I have tried:
df<-
df %>%
group_by(Name)%>%
mutate(Cum-A = as.numeric(ifelse(Fund=="A",cumsum(SalesAmount),0)))%>%
mutate(Cum-B = as.numeric(ifelse(Fund=="B",cumsum(SalesAmount),0)))
but it is not at all what I want: it shows the running total of both funds, albeit only on the rows where the fund matches.
Kindly help.
How about:
library(dplyr)
# Running total of SalesAmount per Name, one column per fund.
d %>%
group_by(Name) %>%
# Rows of the other fund, or with a missing Fund, contribute 0, so the running
# total is carried forward unchanged; !is.na(Fund) keeps NA out of the test.
mutate(cA=cumsum(ifelse(!is.na(Fund) & Fund=="A",SalesAmount,0))) %>%
mutate(cB=cumsum(ifelse(!is.na(Fund) & Fund=="B",SalesAmount,0)))
The output:
Source: local data frame [10 x 8]
Groups: Name
Name Event SalesAmount Fund Cum.A.desired. Cum.B.desired. cA cB
1 John Webinar NA NA NA NA 0 0
2 John Sale 1000 A 1000 NA 1000 0
3 John Sale 2000 B 1000 2000 1000 2000
4 John Sale 3000 A 4000 2000 4000 2000
5 John Email NA NA 4000 2000 4000 2000
6 Tom Webinar NA NA NA NA 0 0
7 Tom Sale 1000 A 1000 NA 1000 0
8 Tom Sale 2000 B 1000 2000 1000 2000
9 Tom Sale 3000 A 4000 2000 4000 2000
10 Tom Email NA NA 4000 2000 4000 2000
Zeroes in the resulting columns can be replaced by NA afterwards if needed (this assumes the output of the pipeline above has been stored in `result`):
result$cA[result$cA==0] <- NA
result$cB[result$cB==0] <- NA
Your input data set:
d <- structure(list(Name = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("John", "Tom"), class = "factor"), Event = structure(c(3L, 2L, 2L, 2L, 1L, 3L, 2L, 2L, 2L, 1L), .Label = c("Email", "Sale", "Webinar"), class = "factor"), SalesAmount = c(NA, 1000L, 2000L, 3000L, NA, NA, 1000L, 2000L, 3000L, NA), Fund = structure(c(NA, 1L, 2L, 1L, NA, NA, 1L, 2L, 1L, NA), .Label = c("A", "B"), class = "factor"), Cum.A.desired. = c(NA, 1000L, 1000L, 4000L, 4000L, NA, 1000L, 1000L, 4000L, 4000L), Cum.B.desired. = c(NA, NA, 2000L, 2000L, 2000L, NA, NA, 2000L, 2000L, 2000L)), .Names = c("Name", "Event", "SalesAmount", "Fund", "Cum.A.desired.", "Cum.B.desired." ), class = "data.frame", row.names = c(NA, -10L))
Here's an approach generalizing to more funds, using zoo and data.table:
# prep
require(data.table)
require(zoo)
setDT(d)
d[,Fund:=as.character(Fund)] # because factors are the worst
uf <- unique(d[Event=="Sale"]$Fund) # collect set of funds
First, assign cumulative sales on the relevant subset of observations:
for (f in uf) d[(Event=="Sale"&Fund==f),paste0('c',f):=cumsum(SalesAmount),by=Name]
Then, carry the last observation forward:
d[,paste0('c',uf):=lapply(.SD,na.locf,na.rm=FALSE),.SDcols=paste0('c',uf),by=Name]
You can shorten @Marat's answer slightly by rolling it all into a single mutate:
df %>%
group_by(Name) %>%
mutate(
cA = cumsum(ifelse(!is.na(Fund) & Fund == "A", SalesAmount, 0)),
cB = cumsum(ifelse(!is.na(Fund) & Fund == "B", SalesAmount, 0)),
cA = ifelse(cA == 0, NA, cA),
cB = ifelse(cB == 0, NA, cB)
)

controlling text when using add_tooltip in ggvis - r

I am trying to get more control over the text that appears when using add_tooltip in ggvis.
Say I want to plot 'totalinns' against 'avg' for this dataframe. Color points by 'country'.
The text I want to appear in the hovering tooltip would be: 'player', 'country', 'debutyear' 'avg'
tmp:
# player totalruns totalinns totalno totalout avg debutyear country
# 1 AG Ganteaume 112 1 0 1 112.00000 1948 WI
# 2 DG Bradman 6996 80 10 70 99.94286 1928 Aus
# 3 MN Nawaz 99 2 1 1 99.00000 2002 SL
# 4 VH Stollmeyer 96 1 0 1 96.00000 1939 WI
# 5 DM Lewis 259 5 2 3 86.33333 1971 WI
# 6 Abul Hasan 165 5 3 2 82.50000 2012 Ban
# 7 RE Redmond 163 2 0 2 81.50000 1973 NZ
# 8 BA Richards 508 7 0 7 72.57143 1970 SA
# 9 H Wood 204 4 1 3 68.00000 1888 Eng
# 10 JC Buttler 200 3 0 3 66.66667 2014 Eng
I understand that I need to make a key/id variable as ggvis only takes information supplied to it. Therefore I need to refer back to the original data. I have tried changing my text inside of my paste0() command, but still can't get it right.
tmp$id <- 1:nrow(tmp)
all_values <- function(x) {
if(is.null(x)) return(NULL)
row <- tmp[tmp$id == x$id, ]
paste0(tmp$player, tmp$country, tmp$debutyear,
tmp$avg, format(row), collapse = "<br />")
}
tmp %>% ggvis(x = ~totalinns, y = ~avg, key := ~id) %>%
layer_points(fill = ~factor(country)) %>%
add_tooltip(all_values, "hover")
Find below code to reproduce example:
tmp <- structure(list(player = c("AG Ganteaume", "DG Bradman", "MN Nawaz",
"VH Stollmeyer", "DM Lewis", "Abul Hasan", "RE Redmond", "BA Richards",
"H Wood", "JC Buttler"), totalruns = c(112L, 6996L, 99L, 96L,
259L, 165L, 163L, 508L, 204L, 200L), totalinns = c(1L, 80L, 2L,
1L, 5L, 5L, 2L, 7L, 4L, 3L), totalno = c(0L, 10L, 1L, 0L, 2L,
3L, 0L, 0L, 1L, 0L), totalout = c(1L, 70L, 1L, 1L, 3L, 2L, 2L,
7L, 3L, 3L), avg = c(112, 99.9428571428571, 99, 96, 86.3333333333333,
82.5, 81.5, 72.5714285714286, 68, 66.6666666666667), debutyear = c(1948L,
1928L, 2002L, 1939L, 1971L, 2012L, 1973L, 1970L, 1888L, 2014L
), country = c("WI", "Aus", "SL", "WI", "WI", "Ban", "NZ", "SA",
"Eng", "Eng")), .Names = c("player", "totalruns", "totalinns",
"totalno", "totalout", "avg", "debutyear", "country"), class = c("tbl_df",
"data.frame"), row.names = c(NA, -10L))
I think this is closer:
# Build the hover-tooltip HTML for the point described by `x`.
#
# x: the hover payload passed in by add_tooltip(); NULL when no point is
#    hovered, otherwise a list whose `id` element is matched against tmp$id.
# Returns NULL for a NULL payload; otherwise the player, country, debut year
# and average of the matching row, joined with "<br>".
all_values <- function(x) {
  if (is.null(x)) return(NULL)
  # Look the row up by its id value rather than by position, so the tooltip
  # stays correct even if `tmp` has been reordered or filtered.
  row <- tmp[tmp$id == x$id, ]
  paste(row$player, row$country, row$debutyear, row$avg, sep = "<br>")
}

Resources