Sankey plot with the riverplot package - r

Enchanté.
EDIT: Solution
As pointed out by MartineJ and emilliman5, nodes should be uniquely labelled (below).
library("riverplot")
nodes<-structure(list(ID = c("2011+", "2011-", "2016+", "2016-"), x = c(20,
20, 30, 30), y = c(50, 40, 50, 40)), class = "data.frame", row.names = c(NA,
-4L))
edges<-structure(list(N1 = c("2011+", "2011-", "2011+", "2011-"), N2 =
c("2016+", "2016-", "2016-", "2016+"), Value = c(461, 7, 0, 46)), class =
"data.frame", row.names = c(NA, -4L))
river <- makeRiver(nodes,edges)
riverplot(river)
I've been toying to plot a Sankey diagram/riverplot (using the riverplot package) of how cancer registrations evolve over time, though this code has bought me little success so far. Could anyone possibly direct me on the faults of this code?
Warning message: In checkedges(x2$edges, names(x2)) : duplicated edge information, removing 1 edges
Here is the suspect code:
library(“riverplot”)
edges<-structure(list(N1 = c("+", "-", "+", "-"), N2 = c("+", "-", "-", "+"), Value = c(664L, 50L, 0L, 46L)), .Names = c("N1", "N2", "Value"), class = "data.frame", row.names = c(NA, -4L))
nodes = data.frame(ID = unique(c(edges$N1, edges$N2)), stringsAsFactors = FALSE)
nodes$x = c(1,2)
rownames(nodes) = nodes$ID
rp <- list(nodes=nodes, edges=edges)
class(rp) <- c(class(rp), "riverplot")
plot(rp)
And the data, which is included in code:
N1 N2 Value
+ + 664
- - 50
+ - 0
- + 46
Eternally grateful.

It looks like you're using the same value multiple times in N1 (and in N2). Try to make them all different (per column) and try again, f.i.:
N1: plus1 minus1 plus2 minus2
If you want to show only + and -: in makeRiver, there is an option **node_labels **

Your nodes need to be named uniquely and then use the nodes$labels to change it back:
library(riverplot)
edges<-structure(list(N1 = c("+", "-", "+", "-"), N2 = c("+", "-", "-", "+"), Value = c(664L, 50L, 0L, 46L)), .Names = c("N1", "N2", "Value"), class = "data.frame", row.names = c(NA, -4L))
edges$N1 <- paste0(edges$N1, "a")
edges$N2 <- paste0(edges$N2, "b")
nodes = data.frame(ID = unique(c(edges$N1, edges$N2)), stringsAsFactors = FALSE)
nodes$x = c(1,1,2,2)
nodes$labels <- as.character(substr(nodes$ID, 1, 1))
rownames(nodes) = nodes$ID
rp <- list(nodes=nodes, edges=edges)
class(rp) <- c(class(rp), "riverplot")
plot(rp)

Related

How to create two stacked bar charts next to each other using ggplot. I want to recreate the below chart:

I have the below 2 dataframes:
lc2 <- structure(list(group = 1:3, sumpct = c(13, 32, 54)), class = "data.frame", row.names = c(NA,
-3L))
note this is for the "likelihood to click" bar (see image), where "extremely/somewhat likely" is
13%, neutral is 32, and extremely/somewhat unlikely is 54)
and
le2 <- structure(list(e = 1:3, t = c(13, 38, 48)), class = "data.frame", row.names = c(NA,
-3L))
note similarly this code above is for "likelihood to enroll" bar below.
But I want to create this:
lc2 <- structure(list(group = 1:3, sumpct = c(13, 32, 54)),
class = "data.frame", row.names = c(NA, -3L))
le2 <- structure(list(e = 1:3, t = c(13, 38, 48)),
class = "data.frame", row.names = c(NA, -3L))
lc2$type <- "click"
le2$type <- "enroll"
colnames(lc2) <- c("group", "pct", "type")
colnames(le2) <- c("group", "pct", "type")
library(data.table)
library(ggplot2)
dt <- rbindlist(list(lc2, le2))
dt[, group := as.factor(group)]
ggplot(dt, aes(x = type, y = pct, fill = group)) +
geom_bar(stat = "identity") +
geom_text(aes(label=scales::percent(pct/100)), position = position_stack(vjust = .5))+
theme_classic() +
coord_flip()

Naming N sublists in R

This was my first question for naming a list that contained lists that contained a dataframe:
Now my list contains even more lists, but at the end a dataframe:
So for every list level, I do have a vector, for the names these are the vectors:
spaces = 5;
low_mu0 = 0.005; high_mu0 = 0.01;
low_sigma_e = 0.0001; high_sigma_e = 0.001;
low_tau0 = 10; high_tau0 = 20;
low_phi1 = 0.5; high_phi1 = 2;
low_phi2 = -0.5; high_phi2 = 0.5;
low_sigma_n = 0.0001; high_sigma_n = 0.001;
low_c0 = -0.1; high_c0 = 0.1;
low_c1 = -0.1; high_c1 = 0.1
range_mu0 <- seq(low_mu0, high_mu0, length.out = spaces)
range_sigma_e <- seq(low_sigma_e, high_sigma_e, length.out = spaces)
range_tau0 <- seq(low_tau0, high_tau0, length.out = spaces)
range_phi1 <- seq(low_phi1, high_phi1, length.out = spaces)
range_phi2 <- seq(low_phi2, high_phi2, length.out = spaces)
range_sigma_n <- seq(low_sigma_n, high_sigma_n, length.out = spaces)
range_c0 <- seq(low_c0, high_c0, length.out = spaces)
range_c1 <- seq(low_c1, high_c1, length.out = spaces)
For naming each level I tried this:
a2 <- setNames(lapply(a, function(x)
setNames(x, nm = paste("mu =", range_mu0))),
setNames(x, paste("sigma_e =", range_sigma_e)),
setNames(x, paste("tau =", range_tau0)),
setNames(x, paste("phi1 =", range_phi1)),
setNames(x, paste("phi2 =", range_phi2)),
setNames(x, paste("sigma_n =", range_sigma_n)),
setNames(x, paste("c0 =", range_c0)),
paste("c1 =", range_c1))
But gives me an error, is there any way to have a general naming version of this question?
Thanks in advance!
data
the data is kinda massive, so I put it here
I would do this recursively. I'm using the two-level list from your previous question but it works for deeper lists as well.
set_names <- function(lst, level.names) {
if (length(level.names) > 0) {
names(lst) <- paste(names(level.names)[1], '=', level.names[[1]])
lapply(lst, set_names, level.names[-1])
} else { # deepest level
lst
}
}
# store level names in a list
lev.names <- list('y0'=seq(10, 20, length.out=3),
'sigma_e'=seq(0, 1, length.out=3))
lst <- set_names(lst, lev.names)
sapply(lst, names)
# y0 = 10 y0 = 15 y0 = 20
# [1,] "sigma_e = 0" "sigma_e = 0" "sigma_e = 0"
# [2,] "sigma_e = 0.5" "sigma_e = 0.5" "sigma_e = 0.5"
# [3,] "sigma_e = 1" "sigma_e = 1" "sigma_e = 1"
Data:
lst <- list(list(structure(list(period = 1:10, y = c(NA, 10, 10, 10,
10, 10, 10, 10, 10, 10)), class = "data.frame", row.names = c(NA,
-10L)), structure(list(period = 1:10, y = c(NA, 10, 10.7793541570746,
10.8146083527869, 10.8792522203673, 11.736784713809, 11.9672428168036,
11.3347121995003, 10.9912857735535, 10.7684547885036)), class = "data.frame", row.names = c(NA,
-10L)), structure(list(period = 1:10, y = c(NA, 10, 11.5587083141491,
11.6292167055737, 11.7585044407346, 13.4735694276179, 13.9344856336071,
12.6694243990006, 11.9825715471071, 11.5369095770071)), class = "data.frame", row.names = c(NA,
-10L))), list(structure(list(period = 1:10, y = c(NA, 15, 15,
15, 15, 15, 15, 15, 15, 15)), class = "data.frame", row.names = c(NA,
-10L)), structure(list(period = 1:10, y = c(NA, 15, 15.7793541570746,
15.8146083527869, 15.8792522203673, 16.736784713809, 16.9672428168036,
16.3347121995003, 15.9912857735535, 15.7684547885036)), class = "data.frame", row.names = c(NA,
-10L)), structure(list(period = 1:10, y = c(NA, 15, 16.5587083141491,
16.6292167055737, 16.7585044407346, 18.4735694276179, 18.9344856336071,
17.6694243990006, 16.9825715471071, 16.5369095770071)), class = "data.frame", row.names = c(NA,
-10L))), list(structure(list(period = 1:10, y = c(NA, 20, 20,
20, 20, 20, 20, 20, 20, 20)), class = "data.frame", row.names = c(NA,
-10L)), structure(list(period = 1:10, y = c(NA, 20, 20.7793541570746,
20.8146083527868, 20.8792522203673, 21.736784713809, 21.9672428168036,
21.3347121995003, 20.9912857735535, 20.7684547885036)), class = "data.frame", row.names = c(NA,
-10L)), structure(list(period = 1:10, y = c(NA, 20, 21.5587083141491,
21.6292167055737, 21.7585044407346, 23.4735694276179, 23.9344856336071,
22.6694243990006, 21.9825715471071, 21.5369095770071)), class = "data.frame", row.names = c(NA,
-10L))))
With the original data, we need a nested lapply with setNames
a2 <- setNames(lapply(a, function(x1)
setNames(lapply(x1, function(x2)
setNames(lapply(x2, function(x3)
setNames(lapply(x3, function(x4)
setNames(lapply(x4, function(x5)
setNames(lapply(x5, function(x6)
setNames(lapply(x6, function(x7)
setNames(x7, paste("c1 =", range_c1))
), paste("c0 =", range_c0))
), paste("sigma_n =", range_sigma_n))
), paste("phi2 =", range_phi2))
), paste("phi1 =", range_phi1))
), paste("tau =", range_tau0))
), paste("sigma_e =", range_sigma_e))
), paste("mu =", range_mu0))
-output

Return row(i) if two columns match

I have two datasets:
df1
ID paddock cow ID
90/123 10 09/123
90/124 11 09/124
90/125 11 09/124
df2
ID paddock
09/123 20
09/124 21
I would like to match df1$cowID with df2$ID and return df2$paddock for whatever row matches. My current code is as follows:
dt <- ifelse(df1$cowID %in% df2$ID, df2$paddock[i], NA)
But I'm getting a return error. Could someone direct me in the right direction please? Thanks in advance!
You might consider joining the datasets.
dplyr::left_join(df1, df2, by = c('cow ID', 'ID')
You should probably use match :
df1$df2_paddock <- df2$paddock[match(df1$cow_ID, df2$ID)]
df1
# ID paddock cow_ID df2_paddock
#1 90/123 10 09/123 20
#2 90/124 11 09/124 21
data
df1 <- structure(list(ID = structure(1:2, .Label = c("90/123", "90/124"
), class = "factor"), paddock = 10:11, cow_ID = structure(1:2, .Label = c("09/123",
"09/124"), class = "factor")), class = "data.frame", row.names = c(NA, -2L))
df2 <- structure(list(ID = structure(1:2, .Label = c("09/123", "09/124"
), class = "factor"), paddock = 20:21), class = "data.frame",
row.names = c(NA, -2L))
You can do that by joining the two dataframes and getting the column that you want.
Using Base R
df1 <-
data.frame(
ID = c("90/123", "90/124"),
paddock = c(10, 11),
cow_ID = c("09/123", "09/124")
)
df2 <-
data.frame(
ID = c("90/123", "90/124"),
paddock = c(20, 21)
)
# Joining the two dataframes by ID then choosing coloum of interest
merge(df1, df2, by = c("ID"), suffixes = c(".x", ".y"))["paddock.y"]
# paddock.y
# 20
# 21
Using Dplyr
library(dplyr)
df1 <-
data.frame(
ID = c("90/123", "90/124"),
paddock = c(10, 11),
cow_ID = c("09/123", "09/124")
)
df2 <-
data.frame(
ID = c("90/123", "90/124"),
paddock = c(20, 21)
)
# Joining the two dataframes by ID then choosing coloum of interest
df1 %>%
inner_join(df2, by = c("ID"), suffixes = c(".x", ".y")) %>%
select(paddock.y) %>%
rename(paddock = paddock.y)
# paddock
# 20
# 21
If you would like to use ifelse(), maybe you can use the following code to make it
with(df2,ifelse(ID %in% df1$cow_ID,paddock,NA))
such that
> with(df2,ifelse(ID %in% df1$cow_ID,paddock,NA))
[1] 20 21
DATA
df1 <- structure(list(ID = structure(1:3, .Label = c("90/123", "90/124",
"90/125"), class = "factor"), paddock = c(10, 11, 11), cow_ID = structure(c(1L,
2L, 2L), .Label = c("09/123", "09/124"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
df2 <- structure(list(ID = structure(1:2, .Label = c("09/123", "09/124"
), class = "factor"), paddock = c(20, 21)), class = "data.frame", row.names = c(NA,
-2L))

How to create differences between several pairs of columns?

I have a panel (cross-sectional time series) dataset. For each group (defined by (NAICS2, occ_type) in time ym) I have many variables. For each variable I would like to subtract each group's first (dplyr::first) value from every value of that group.
Ultimately I am trying to take the Euclidean difference between the vector of each row 's group's first entry, (i.e. sqrt(c_1^2 + ... + c_k^2).
I was able to create the a column equal to the first entries for each group:
df2 <- df %>%
group_by(ym, NAICS2, occ_type) %>%
distinct(ym, NAICS2, occ_type, .keep_all = T) %>%
arrange(occ_type, NAICS2, ym) %>%
select(group_cols(), ends_with("_scf")) %>%
mutate_at(vars(-group_cols(), ends_with("_scf")),
list(first = dplyr::first))
I then tried to include variations of f.diff = . - dplyr::first(.) in the list, but none of those worked. I googled the dot notation for a while as well as first and lag in dplyr timeseries but have not been able to resolve this yet.
Ideally, I unite all variables into a vector for each row first and then take the difference.
df2 <- df %>%
group_by(ym, NAICS2, occ_type) %>%
distinct(ym, NAICS2, occ_type, .keep_all = T) %>%
arrange(occ_type, NAICS2, ym) %>%
select(group_cols(), ends_with("_scf")) %>%
unite(vector, c(-group_cols(), ends_with("_scf")), sep = ',') %>%
# TODO: DISTANCE_BETWEEN_ENTRY_AND_FIRST
mutate(vector.diff = ???)
I expect the output to be a numeric column that contains a distance measure of how different each group's row vector is from its initial row vector.
Here is a sample of the data:
structure(list(ym = c("2007-01-01", "2007-02-01"), NAICS2 = c(0L,
0L), occ_type = c("is_middle_manager", "is_middle_manager"),
Administration_scf = c(344, 250), Agriculture..Horticulture..and.the.Outdoors_scf = c(11,
17), Analysis_scf = c(50, 36), Architecture.and.Construction_scf = c(57,
51), Business_scf = c(872, 585), Customer.and.Client.Support_scf = c(302,
163), Design_scf = c(22, 17), Economics..Policy..and.Social.Studies_scf = c(7,
7), Education.and.Training_scf = c(77, 49), Energy.and.Utilities_scf = c(25,
28), Engineering_scf = c(90, 64), Environment_scf = c(19,
19), Finance_scf = c(455, 313), Health.Care_scf = c(105,
71), Human.Resources_scf = c(163, 124), Industry.Knowledge_scf = c(265,
174), Information.Technology_scf = c(467, 402), Legal_scf = c(21,
17), Maintenance..Repair..and.Installation_scf = c(194, 222
), Manufacturing.and.Production_scf = c(176, 174), Marketing.and.Public.Relations_scf = c(139,
109), Media.and.Writing_scf = c(18, 20), Personal.Care.and.Services_scf = c(31,
16), Public.Safety.and.National.Security_scf = c(14, 7),
Religion_scf = c(0, 0), Sales_scf = c(785, 463), Science.and.Research_scf = c(52,
24), Supply.Chain.and.Logistics_scf = c(838, 455), total_scf = c(5599,
3877)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), groups = structure(list(ym = c("2007-01-01",
"2007-02-01"), NAICS2 = c(0L, 0L), occ_type = c("is_middle_manager",
"is_middle_manager"), .rows = list(1L, 2L)), row.names = c(NA,
-2L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))

Get all.polarity value from qdap package results in R

I wanted to do sentimental analysis in R using qdap package.
It gives out a data frame containing all.all, all.wc, all.polarity, all.pos.words, all.neg.words etc.
I want to extract the values of all.polarity, all.pos.words,all.neg.words but when i use
sentiment$all.polarity or sentiment$all.pos.words,
I get NULL in result.
dput(head(sentiment))
list(structure(list(all = c("all", "all", "all"), wc = c(44L,
1L, 1L), polarity = c(-0.422115882408869, 0, 0), pos.words = list(
"-", "-", "-"), neg.words = list(c("disappointed", "issue"
), "-", "-"), text.var = c("list(list(content = \" misleaded icici bank customer care branch excutive really disappointed bank dont know steps take get issue fixed\", meta = list(author = character(0), datetimestamp = list(sec = 20.097678899765, min = 51, hour = 11, mday = 6, mon = 6, year = 115, wday = 1, yday = 186, isdst = 0), description = character(0), heading = character(0), id = \"1\", language = \"en\", origin = character(0))))",
"list()", "list()")), row.names = c(NA, -3L), .Names = c("all",
"wc", "polarity", "pos.words", "neg.words", "text.var"), class = "data.frame"),
structure(list(all = c("all", "all", "all"), wc = c(61L,
1L, 1L), polarity = c(0, 0, 0), pos.words = list("led", "-",
"-"), neg.words = list("expire", "-", "-"), text.var = c("list(list(content = \" didnt know customer banking icici years will led people looking student travel card staff mg road treat customers tried offer card wud expire one year n told get new card one year dont know\", meta = list(author = character(0), datetimestamp = list(sec = 20.3989679813385, min = 51, hour = 11, mday = 6, mon = 6, year = 115, wday = 1, yday = 186, isdst = 0), description = character(0), heading = character(0), id = \"1\", language = \"en\", origin = character(0))))",
"list()", "list()")), row.names = c(NA, -3L), .Names = c("all",
"wc", "polarity", "pos.words", "neg.words", "text.var"), class = "data.frame"),
structure(list(all = c("all", "all", "all"), wc = c(58L,
1L, 1L), polarity = c(0, 0, 0), pos.words = list("top", "-",
"-"), neg.words = list("worst", "-", "-"), text.var = c("list(list(content = \" asked staff can upgrade platinum coral card documentation fee will involoved even receiving card poeple sill keep calling top levied rs joining fee interested paying card one worst customer care experienced\", meta = list(author = character(0), datetimestamp = list(sec = 20.648964881897, min = 51, hour = 11, mday = 6, mon = 6, year = 115, wday = 1, yday = 186, isdst = 0), description = character(0), heading = character(0), id = \"1\", language = \"en\", \n origin = character(0))))",
"list()", "list()")), row.names = c(NA, -3L), .Names = c("all",
"wc", "polarity", "pos.words", "neg.words", "text.var"), class = "data.frame"),
structure(list(all = c("all", "all", "all"), wc = c(59L,
1L, 1L), polarity = c(-0.494717861727131, 0, 0), pos.words = list(
"-", "-", "-"), neg.words = list(c("long time", "long time",
"disappointed"), "-", "-"), text.var = c("list(list(content = \" applied credit card corporate scheme long time back got verification call also long time back initially getting least response executive now longer picking call neither letting know status application extremely disappointed service\", meta = list(author = character(0), datetimestamp = list(sec = 20.8989698886871, min = 51, hour = 11, mday = 6, mon = 6, year = 115, wday = 1, yday = 186, isdst = 0), description = character(0), heading = character(0), id = \"1\", \n language = \"en\", origin = character(0))))",
"list()", "list()")), row.names = c(NA, -3L), .Names = c("all",
"wc", "polarity", "pos.words", "neg.words", "text.var"), class = "data.frame"),
structure(list(all = c("all", "all", "all"), wc = c(66L,
1L, 1L), polarity = c(0.0246182981958665, 0, 0), pos.words = list(
c("work", "support"), "-", "-"), neg.words = list("disappointed",
"-", "-"), text.var = c("list(list(content = \" otp service working used work month decided change everything im getting otp sms registered mobile number ive tried contacting customer support several times keep asking send sms despite done several times several days havent received otps ever really disappointed\", meta = list(author = character(0), datetimestamp = list(sec = 21.1935319900513, min = 51, hour = 11, mday = 6, mon = 6, year = 115, wday = 1, yday = 186, isdst = 0), description = character(0), \n heading = character(0), id = \"1\", language = \"en\", origin = character(0))))",
"list()", "list()")), row.names = c(NA, -3L), .Names = c("all",
"wc", "polarity", "pos.words", "neg.words", "text.var"), class = "data.frame"),
structure(list(all = c("all", "all", "all"), wc = c(50L,
1L, 1L), polarity = c(-0.282842712474619, 0, 0), pos.words = list(
"-", "-", "-"), neg.words = list(c("pathetic", "lied"
), "-", "-"), text.var = c("list(list(content = \" pathetic service behavior icici bank facing past days icici executive lied luring upgrade debit card terms conditions just opposite booklet received told phone\", meta = list(author = character(0), datetimestamp = list(sec = 21.4258019924164, min = 51, hour = 11, mday = 6, mon = 6, year = 115, wday = 1, yday = 186, isdst = 0), description = character(0), heading = character(0), id = \"1\", language = \"en\", origin = character(0))))",
"list()", "list()")), row.names = c(NA, -3L), .Names = c("all",
"wc", "polarity", "pos.words", "neg.words", "text.var"), class = "data.frame"))
Can anyone suggest how to do this?
The following works for me -
library(qdap)
text <- "I am liking the work " # the text for which polarity score is needed
sentiment <- polarity(text) #make the call
sentiment$all$pos.words # returns the positive words detected by the algo
#[[1]]
#[1] "liking" "work"
sentiment$all$polarity # returns the sentence polarity score
#[1] 0.8944272

Resources