Use of purrr's "modify_if" with a function - r

I'm trying to apply the discretize_rgr function (here) of the package funModeling to multiple columns of a dataframe.
For a single column, it is working for me in this way:
discretize_rgr(input = df.div$to_be_discretized, target = df.div$TARGET, max_n_bins=10)
So, I'm trying to use the purrr package to manage multiple columns in this way:
df.div %>%
modify_if( is.numeric, ~ discretize_rgr(., target = df.div$TARGET, max_n_bins=10))
but I get the following error:
Error in order(fpoints_top) : argument 1 is not a vector
What's wrong?
UPDATE (example data)
structure(list(to_be_discretized = c(0.0152096300012854, 0.0132660373578711,
0.014699121782711, 0.0157102877064037, 0.0197417484744586, 0.019651999420645
), TARGET = c(27136, 30048, 34840, 138812, 191088, 240370)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L))

Related

Using data.tree package in R

I am trying to adapt the code found here to import employee data from an Excel file into a data frame and then use the as.Node function from the data.tree package.
This is the code I have written so far
library(data.tree)
library(readxl)
baseframe <- read_excel("Test Emplist.xlsx")
baseframe$pathstring <- paste("CompanyName",
baseframe$LastName,
baseframe$FirstName,
sep = "/")
stafflist <- as.Node(baseframe)
The data frame is being created successfully. Below is the dput representation
> dput(head(baseframe))
structure(list(LastName = c("Vasa", "Vasa", "Pras", "Tang", "Sing",
"Vats"), FirstName = c("Evan", "Koma", "Shil", "Hand", "Smri",
"Saur"), pathstring = c("CompanyName/Vasa/Evan", "CompanyName/Vasa/Koma",
"CompanyName/Pras/Shil", "CompanyName/Tang/Hand", "CompanyName/Sing/Smri",
"CompanyName/Vats/Saur")), .Names = c("LastName", "FirstName",
"pathstring"), row.names = c(NA, 6L), class = c("tbl_df", "tbl",
"data.frame"))
but when I get to the line stafflist <- as.Node(baseframe) I am getting an error message saying
Error in strsplit(mypath, pathDelimiter, fixed = TRUE :
non-character argument
I'm guessing the as.Node function calls another function called strsplit somewhere. I have tried running that function myself like so
strsplit(baseframe$pathstring, "/", fixed = TRUE)
which runs without a problem. I'm not sure why the as.Node function is throwing the error?

Find zero crossing in R

If I have the following data:
df <- structure(list(x = c(1.63145539094563, 1.67548187017034, 1.71950834939504,
1.76353482861975, 1.80756130784445, 1.85158778706915, 1.89561426629386,
1.93964074551856, 1.98366722474327, 2.02769370396797, 2.07172018319267,
2.11574666241738, 2.15977314164208, 2.20379962086679, 2.24782610009149,
2.2918525793162, 2.3358790585409, 2.3799055377656, 2.42393201699031,
2.46795849621501, 2.51198497543972, 2.55601145466442, 2.60003793388912,
2.64406441311383, 2.68809089233853, 2.73211737156324, 2.77614385078794,
2.82017033001265, 2.86419680923735, 2.90822328846205, 2.95224976768676,
2.99627624691146, 3.04030272613617, 3.08432920536087, 3.12835568458557,
3.17238216381028, 3.21640864303498, 3.26043512225969, 3.30446160148439,
3.3484880807091, 3.3925145599338, 3.4365410391585, 3.48056751838321,
3.52459399760791, 3.56862047683262, 3.61264695605732, 3.65667343528202,
3.70069991450673, 3.74472639373143, 3.78875287295614), y = c(24.144973858154,
18.6408277478876, 21.9174270206615, 22.8017876727379, 20.9766270378248,
18.604384256745, 18.4805250429826, 15.8436744335752, 13.6357170277296,
11.6228806771368, 9.4065868126964, 6.81644596802601, 4.41187500831424,
4.31911614349431, 0.678259284890563, -1.18632719250877, -2.32986407762089,
-3.84480566043122, -5.24738510499144, -5.20160089844013, -5.42094587600499,
-5.39886757202858, -5.26753920575326, -4.68727963638973, -2.73267203102102,
0.296905237887623, 2.45725152489283, 5.12102449689086, 7.13986218237411,
10.2044876281093, 14.4358946463429, 19.0643081865458, 22.8920445618834,
26.7229418763085, 31.3776791707576, 36.19058349817, 41.2843224331918,
46.3396522631345, 51.4321502764393, 56.4080998038294, 61.5215778808583,
66.6845421308734, 71.3912749310486, 76.0856977880158, 80.7039319129457,
84.4095953723555, 88.0163019647757, 89.918078622734, 91.6341473685881,
94.0404562451352)), class = c("tbl_df", "tbl", "data.frame"), .Names = c("x",
"y"), row.names = c(NA, -50L))
Plot:
How do I find the exact x value when y == 0? I tried doing interpolation, but it does not necessarily give me a y value equal to zero. Does anyone know of a function to find zero crossings?
Firstly, one can define a corresponding (linearly) interpolated function with
approxfun(df$x, df$y)
where the result looks like
curve(approxfun(df$x, df$y)(x), min(df$x), max(df$x))
Those zero crossings can then be seen as the roots of this function. In base R there is a function uniroot, but it looks for a single root, while in your case we have two. Hence, one option would be the rootSolve package as in
library(rootSolve)
uniroot.all(approxfun(df$x, df$y), interval = range(df$x))
# [1] 2.263841 2.727803

get rows from dataframe matching element of a list

Here is one dataframe/tibble and one character element (this element is one column of a tibble):
df1 <- structure(list(Twitter_name = c("CHESHIREKlD", "JellyComons",
"kirmiziburunlu", "erkekdeyimleri", "herosFrance", "IkishanShah"
), Declared_followers = c(60500L, 43100L, 31617L, 27852L, 26312L,
16021L), Real_followers = c(60241, 43054, 31073, 27853, 25736,
15856), Twitter_Id = c("783866366", "1424086592", "2367932244",
"3352977681", "2580703352", "521094407")), .Names = c("Twitter_name",
"Declared_followers", "Real_followers", "Twitter_Id"), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
myId <- c("867211097882804224", "868806957133688832", "549124465","822580282452754432",
"109344546", "482666188", "61716107", "3642392237", "595318933",
"833365943044628480", "1045015087", "859830740669800448", "860562940059045888",
"2854457294", "871784135983067136", "866922354554814464", "4839343547",
"849451474572759040", "872084673526214656", "794841530053853184")
N.B.: df1 has been shortened; it actually has 128 observations.
I am looking to test all row elements of df1$Twitter_Id and see if they are in myId. I can run this:
> match(myId[1], df1$Twitter_Id)
but:
it stops at the first occurrence
I need to apply the match() function to all elements of myId.
I can't find a clean and simple way to do this, using lapply() or other functions from dplyr or other tidyverse packages.
Thank you for help.
EDIT I need to be more explicit with the whole real case.
myTw <- structure(list(id_str = c("893445199661330433", "893116842558050304",
"892739336466305024", "892401780105019393", "892401594272296963",
"892365572486430720", "891964139756818432")), .Names = "id_str", row.names = c(NA,
-7L), class = c("tbl_df", "tbl", "data.frame"))
These are tweet IDs. What I am looking for is which Twitter users have retweeted them. To do this, I use the retweeters() function from the twitteR package.
library(twitteR)
MyRtw <- retweeters(myTw[1])
MyRtw <- c("889135428028084224", "867211097882804224", "868806957133688832",
"549124465", "822580282452754432", "109344546", "482666188",
"61716107", "3642392237", "595318933", "833365943044628480",
"1045015087", "859830740669800448", "860562940059045888", "2854457294",
"871784135983067136", "866922354554814464", "4839343547", "849451474572759040",
"872084673526214656")
This is a list of Twitter user Id.
Now finally I want to see which users from df1$Twitter_Id have retweeted myTw[1].
You can use the '%in%' operator.
Edit: Probably this is what you want. Here I used the data posted in your original post (before editing).
matchVector = NULL
for (id in df1$Twitter_Id) {
matchCounter <- sum(myId %in% id)
matchVector <- c(matchVector, matchCounter)
}
df1$numberOfMatches <- matchVector

summing integer64 columns not doing what I expect

I do not understand what is going on here. Why does sum work outside of data.table and not inside it? data.table version is 1.94 and bit64 is loaded.
dput(dt)
structure(list(Date = c(20150422L, 20150422L, 20150422L, 20150422L,
20150423L, 20150423L, 20150423L, 20150423L, 20150424L, 20150424L,
20150424L, 20150424L), totcap = structure(c(5.30519039464278e-314,
5.34352625144878e-314, 5.21151503979773e-314, 5.18159473949947e-314,
5.36659973716195e-314, 5.3767197559193e-314, 5.31749562227391e-314,
5.48717086915892e-314, 5.34891674084389e-314, 5.22243170680067e-314,
5.22969347328787e-314, 5.23636617172838e-314), class = "integer64")), .Names = c("Date",
"totcap"), class = c("data.table", "data.frame"), row.names = c(NA,
-12L), .internal.selfref = )
> sum(dt$totcap)
integer64
[1] 128782928014
> dt[,sum(totcap),by=Date]
Date V1
1: 20150422 2.104183e-313
2: 20150423 2.154799e-313
3: 20150424 2.103741e-313

How to use setorder function in R

I have a mock up data set:
d1 = structure(
list(
chan1 = c(1.49955768204777, 1.57924608282282,
1.62079872172079, 1.49955768204777,
1.50897108417039, 1.47897959168283),
chan2 = c(3.71459266186863, 3.71459266186863,
3.66763591782946, 3.67359273988532,
3.66408366995924, 3.68083665073346),
chan3 = c(8.32529316285155, 6.30229174858652,
6.97551768293611, 6.52653674461786,
6.52653674461786, 6.07823977152575),
chan4 = c(11.023719681933, 11.023719681933,
11.023719681933, 11.4613297390623,
11.4613297390623, 11.5813471428122),
chan5 = c(7.32862391337389, 7.38103675023449,
7.81796038841145, 7.4216715642288,
7.51924428352424, 7.35498863975821),
rowname = c(2042051, 1454646, 289170,
3307469, 3890829, 1741489),
total_conv = c(359.161333500186, 359.161312264452,
359.16130836516, 359.161294408793,
359.161289598969, 359.161209958641),
sum = c(31.8917871020749, 30.0008869254455,
31.1056323928309, 30.5826884698421,
30.6801655213341, 30.1743917965125)
),
.Names = c("chan1", "chan2", "chan3", "chan4", "chan5",
"rowname", "total_conv", "sum"),
class = "data.frame",
row.names = c(NA, -6L)
)
Now I need to sort this data set by total_conv and sum variables.
Here total_conv should be sorted in descending order and sum in ascending order.
When I use the following function, I am unable to sort my data set in the required format.
d1<-setorder(as.data.table(d1),-total_conv,sum)
How can I overcome this issue?
You can also try order instead of setorder:
setDT(d1)[order(-total_conv, sum)]
It will first sort by descending total_conv and then, within ties, by ascending sum.

Resources