Drop lowest numeric value - r

I've been stuck on this dplyr manipulation issue for a while now.
Here is a small sample size of my data: dput(test)
structure(list(anon_screen_name = c("40492fd6e817cc25cea942be9eae7c1c5795ffa1",
"862329793fdbcd666d660d9a9d2e3beceb07a0db", "862329793fdbcd666d660d9a9d2e3beceb07a0db",
"862329793fdbcd666d660d9a9d2e3beceb07a0db", "862329793fdbcd666d660d9a9d2e3beceb07a0db",
"862329793fdbcd666d660d9a9d2e3beceb07a0db", "862329793fdbcd666d660d9a9d2e3beceb07a0db",
"862329793fdbcd666d660d9a9d2e3beceb07a0db", "a9c8719499b9ef73c78e85bada231591d807a821",
"a9c8719499b9ef73c78e85bada231591d807a821"), resource_display_name = c("Quiz",
"Quiz", "Quiz", "Quiz", "Quiz", "homework", "homework", "final_exam",
"Quiz", "Quiz"), grade = c(0L, 0L, 0L, 3L, 1L, 0L, 1L, 1L, 1L,
2L), max_grade = c(2L, 1L, 0L, 3L, 1L, 10L, 11L, 1L, 1L, 2L),
percent_grade = c("0", "0", "\\N", "100", "100", "0", "9.09",
"100", "100", "100")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -10L))
Basically, for each anon_screen_name, I want to drop the lowest percent_grade for the homework (in resource_display_name).
I started to write this starter code:
test %>%
mutate(percent_grade = as.numeric(percent_grade)) %>%
group_by(resource_display_name) %>%
summarise(min_percent_grade = min(percent_grade, na.rm = T))
But this only shows me the minimum homework grade without taking out the row with the minimum homework grade
UPDATE:
Basically, borrowing from a comment below, I want to remove the row(s) associated with the lowest value of percent_grade where resource_display_name == 'homework'

Try following codes:
test %>%
mutate(percent_grade = as.numeric(percent_grade)) %>%
filter(resource_display_name == 'homework') %>%
filter(percent_grade > min(percent_grade, na.rm = T)) -> t1
test %>%
mutate(percent_grade = as.numeric(percent_grade)) %>%
filter(resource_display_name != 'homework') -> t2
rbind(t1,t2)

The following will remove all values equal to the minimum per group of resource_display_name. Note that it's a base R solution, there is no need for an external package such as dplyr.
inx <- with(test, ave(as.numeric(percent_grade), resource_display_name, FUN = function(x) x != min(x, na.rm = TRUE)))
inx <- which(as.logical(inx))
test[inx, ]

If you only want to remove a single record, and not all records with the lowest grade, you could do something like the following.
test %>%
mutate(percent_grade = as.numeric(percent_grade)) %>%
group_by(anon_screen_name) %>%
mutate(lowest_grade = 1 * ((percent_grade == min(percent_grade, na.rm=TRUE)) & (resource_display_name == 'homework'))) %>%
arrange(lowest_grade) %>%
filter(row_number() != n()) %>%
ungroup()

Related

How to Plot line chart using R for time-series analysis

I am trying to plot a line chart using Date-time and no of tweets at that period of date and time in R.
library(ggplot2)
df1 <- structure(list(Date = structure(c(1L, 1L, 2L, 1L, 1L, 1L), .Label = c("2020-03-12",
"2020-03-13"), class = "factor"), Time = structure(c(1L, 1L, 2L,
3L, 4L, 5L), .Label = c("00:00:00Z", "00:00:01Z", "00:10:04Z",
"00:25:12Z", "01:00:02Z"), class = "factor"), Text = structure(c(5L,
3L, 6L, 4L, 2L, 1L), .Label = c("The images of demonstrations and gathering", "Premium policy get activate by company abc",
"Launches of rocket", "Premium policy get activate by company abc",
"Technology makes trend", "The images of demonstrations and gatherings",
"Weather forecasting by xyz"), class = "factor")), class = "data.frame", row.names = c(NA,
-6L))
ggplot(df1, aes(x = Date, y = text(count)) + geom_line(aes(color = variable), size = 1)
I tried the above code to plot desired result but got an error. Dataset given like that in csv format.
Date Time Text
2020-03-12 00:00:00Z The images of demonstrations and gatherings
2020-03-12 00:00:00Z Premium policy get activate by company abc
2020-03-12 00:00:01Z Weather forecasting by xyz
2020-03-12 00:10:04Z Technology makes trend
2020-03-12 00:25:12Z Launches of rocket
2020-03-12 01:00:02Z Government launch new policy to different sector improvement
I have a dataset of nearly 15 days and want to plot the line chart to visualize the number of tweets (given in text column) to see the trend of tweets on different time and date.
df1 <- structure(list(Date = structure(c(1L, 1L, 2L, 1L, 1L, 1L), .Label = c("3/12/2020",
"3/13/2020"), class = "factor"), Time = structure(c(1L, 1L, 2L,
3L, 4L, 5L), .Label = c("00:00:00Z", "00:00:01Z", "00:10:04Z",
"00:25:12Z", "01:00:02Z"), class = "factor"), Text = structure(c(5L,
3L, 6L, 4L, 2L, 1L), .Label = c("Government launch new policy to different sector",
"Launches of rocket", "Premium policy get activate by company abc",
"Technology makes trend", "The images of demonstrations and gatherings",
"Weather forecasting by xyz"), class = "factor"), X = structure(c(1L,
1L, 1L, 1L, 1L, 2L), .Label = c("", "improvement"), class = "factor")), class = "data.frame", row.names = c(NA,
-6L))
Creating the dataset df1 as above then running this gives you required plot for hour
library(tidyverse)
library(lubridate)
df1 %>%
mutate(Time=hms(Time),
Date=mdy(Date),
hour=hour(Time)) %>%
count(hour) %>%
ggplot(aes(hour,n,group=1))+geom_line()+geom_point()
Is this what you are after?
library(dplyr)
library(lubridate)
library(stringr)
library(ggplot2)
Answer with your data
To demonstrate data wrangling.
# your data;
df1 <- structure(list(Date = structure(c(1L, 1L, 2L, 1L, 1L, 1L),
.Label = c("2020-03-12","2020-03-13"),
class = "factor"),
Time = structure(c(1L, 1L, 2L,3L, 4L, 5L),
.Label = c("00:00:00Z", "00:00:01Z", "00:10:04Z","00:25:12Z", "01:00:02Z"),
class = "factor"),
Text = structure(c(5L,3L, 6L, 4L, 2L, 1L),
.Label = c("The images of demonstrations and gathering", "Premium policy get activate by company abc",
"Launches of rocket", "Premium policy get activate by company abc",
"Technology makes trend", "The images of demonstrations and gatherings", "Weather forecasting by xyz"), class = "factor")),
class = "data.frame", row.names = c(NA,-6L))
# data wrangle
df2 <-
df1 %>%
# change all variables from factors to character
mutate_all(as.character) %>%
mutate(Time = str_remove(Time, "Z$"), #remove the trailing 'Z' from Time values
dt = ymd_hms(paste(Date, Time, sep = " ")), # change text into datetime format using lubridtate::ymd_hms
dt = ceiling_date(dt, unit="hour")) %>% # round to the end of the named hour, separated for clarity
group_by(dt) %>%
summarise(nr_tweets = n())
# plot
p1 <- ggplot(df2, aes(dt, nr_tweets))+
geom_line()+
scale_x_datetime(date_breaks = "1 day", date_labels = "%d/%m")+
ggtitle("Data from question `df1`")
Answer with made up large dataset
tib <- tibble(dt = sample(seq(ISOdate(2020,05,01), ISOdate(2020,05,15), by = "sec"), 10000, replace = TRUE),
text = sample(c(letters[1:26], LETTERS[1:26]), 10000, replace = TRUE))
tib1 <-
tib %>%
mutate(dt = round_date(dt, unit="hour"))%>%
group_by(dt) %>%
summarise(nr_tweets = n())
p2 <- ggplot(tib1, aes(dt, nr_tweets))+
geom_line()+
scale_x_datetime(date_breaks = "1 day", date_labels = "%d/%m")+
ggtitle("Result using `tib` data made up to answer the question")
p1/p2
Created on 2020-05-13 by the reprex package (v0.3.0)

R left join by multiple columns resulting in NAs

I have two data frames alpha and beta.
dput(alpha)
structure(list(ID = c(29503L, 29507L, 29508L, 29510L),
Q_ID = structure(1:4, .Label = c("q:1392763916495:441", "q:1392763916495:445", "q:1392763916495:449", "q:1392763920794:458"),
class = "factor"),
L_Atmpt = c(0L, 0L, 0L, 0L),
Q_Atmpt = c(0L, 1L, 0L, 1L),
Q_Result = c(1L, 1L, 1L, 0L),
Time_on_Screen = c(13839L, 185162L, 264418L, 2183464L),
Start_Time = structure(1:4, .Label = c("2017-10-31Ê11:51:20", "2017-10-31Ê11:54:26", "2017-10-31Ê11:59:09", "2017-10-31Ê12:35:34"),
class = "factor"),
End_Time = structure(1:4, .Label = c("2017-10-31Ê11:51:33", "2017-10-31Ê11:57:31", "2017-10-1Ê12:03:33", "2017-10-31Ê13:11:57"),
class = "factor"),
Duration = c(173L, 55L, 98L, 1921L)),
class = "data.frame", row.names = c(NA, -4L))
dput(beta)
structure(list(ID = c(29503L, 29507L, 29508L, 29510L, 29515L, 30160L),
Q_ID = structure(1:6, .Label = c("q:1392763916495:441", "q:1392763916495:445", "q:1392763916495:449", "q:1392763920794:458", "q:1392763920794:462", "q:1392763925803:530"),
class = "factor"),
L_Atmpt = c(0L, 0L, 0L, 0L, 0L, 1L),
Q_Atmpt = c(0L, 1L, 0L, 1L, 0L, 0L),
Q_Result = c(1L, 1L, 1L, 0L, 0L, 0L),
Time_on_Screen = c(13839L, 185162L, 264418L, 2183464L, 768470L, 885800L),
Start_Time = structure(c(2L, 3L, 4L, 5L, 6L, 1L), .Label = c("2017-10-25Ê00:19:08", "2017-10-31Ê11:51:20", "2017-10-31Ê11:54:26", "2017-10-31Ê11:59:09", "2017-10-31Ê12:35:34", "2017-10-31Ê13:16:09"),
class = "factor"),
End_Time = structure(c(2L, 3L, 4L, 5L, 6L, 1L), .Label = c("2017-10-25Ê00:33:53", "2017-10-31Ê11:51:33", "2017-10-31Ê11:57:31", "2017-10-31Ê12:03:33", "2017-10-31Ê13:11:57", "2017-10-31Ê13:28:57"),
class = "factor")),
class = "data.frame", row.names = c(NA,-6L))
I want to merge them and get a final data frame gamma. The data frame alpha has a special column: alpha$duration, which I need to add or append at the end of the data frame beta.
beta has more instances than alpha and I want to perform left join so all the instances of beta are retained. This means that some of the entries of the column gamma$duration will be NULL or NA.
I expect, the NULLs or NAs will be those entries where the ID of alpha does not match with the ID of beta. However, for my original data (which has more than 10K rows and around 20 or so variables), I get something like below:
ID Q_ID L_Atmpt Q_Atmpt Q_Result Time_on_Screen Start_Time End_Time Duration
29503 q:1392763916495:441 0 0 1 13839 2017-10-31Ê11:51:20 2017-10-31Ê11:51:33 NA
29507 q:1392763916495:445 0 1 1 185162 2017-10-31Ê11:54:26 2017-10-31Ê11:57:31 NA
29508 q:1392763916495:449 0 0 1 264418 2017-10-31Ê11:59:09 2017-10-31Ê12:03:33 NA
29510 q:1392763920794:458 0 1 0 2183464 2017-10-31Ê12:35:34 2017-10-31Ê13:11:57 NA
29515 q:1392763920794:462 0 0 0 768470 2017-10-31Ê13:16:09 2017-10-31Ê13:28:57 NA
30160 q:1392763925803:530 1 0 0 885800 2017-10-25Ê00:19:08 2017-10-25Ê00:33:53 NA
Unfortunately, the toy example that I shared is not replicating/capturing my problem. I understand it could be challenging to imagine why I am getting NA in my original problem. But any thoughts or advice on this would be highly appreciated.
For reference, I am sharing the different scripts that I have used, they all have rendered the same output:
library(plyr)
gamma = join(beta, alpha, type = "left")
library(dplyr)
gamma = left_join(beta, alpha)
library(sqldf)
gamma = sqldf('SELECT beta.*, alpha.duration
FROM beta LEFT JOIN alpha
on beta.ID == alpha.ID AND
beta.Q_ID == alpha.Q_ID AND
beta.L_Atmpt == alpha.L_Atmpt AND
beta.Q_Atmpt == alpha.Q_Atmpt AND
beta.Start_Time == alpha.Start_Time')
I would like to mention that the column alpha$duration in my original data frame was created after some pre-processing steps such as:
#Step 1: Ordering the data by ID and Start_Time
beta = beta[with(beta, order(ID, Q_ID, Q_Atmpt, Start_Time)), ]
#Step 2: End_Time lagging
library(Hmisc)
# to calculate the time difference we lag the End_Time
beta$End_Time_forward = Lag(beta$End_Time, +1)
# for comparisons, we also lag the IDs
beta$ID_forward = Lag(beta$ID, +1)
#Step 3: Now calculate the required time differences
library(sqldf)
alpha = sqldf('SELECT beta.*,
(Start_Time - End_Time_forward),
(End_Time - End_Time_forward)
FROM beta
WHERE ID_forward == ID')
#Step 4: Columns renaming
names(alpha)[names(alpha) == "(Start_Time - End_Time_forward)"] = "duration"
names(alpha)[names(alpha) == "(End_Time - End_Time_forward)"] = "end_duration"
#Step 5:Few instances have negative duration, so replace the gap between
# (last end time and current start time) with the (last end time and current
# end time) difference
alpha = alpha %>%
mutate(duration = if_else(duration < 0, end_duration, duration))
#Step 6: Convert the remaining negatives with NAs
alpha$duration[alpha$duration < 0] <- NA
#Step 7: Now replace those NAs by using the imputeTS function
library(imputeTS)
alpha$duration = na_locf(alpha$duration, option = 'locf',
na_remaining = 'rev', maxgap = Inf)
I suspect, the last two steps where I have manipulated the gamma$duration variable might have something to do with such unexpected results
I have not been able to determine the actual cause of this issue, however, I have found a work-around this problem:
beta$duration = as.integer(0)
test2 = merge(x = beta, y = alpha,
by = c("ID", "Q_ID", "L_Atmpt", "Q_Atmpt", "Q_Result", "Time_on_Screen", "Start_Time", "End_Time"),
all.x = TRUE)
Through this, I can access/retain the duration column of the data frame alpha and then use it as I want to.

Apply changes (group by) on multiple dataframes using for loop

I have around 33 dataframes (df1, df2, df3, df4 ...) that look like this:
Date Month Value
2018-07-16 2018-07 10
2018-07-17 2018-07 2
2018-07-18 2018-07 4
2018-07-19 2018-07 45
2018-07-20 2018-07 13
and I would like to group each data frame by month, like this:
df1 = df1 %>% group_by(Month)%>%
summarise(
sd_value = sd(value)
)
How can I do this on all dataframes without repeating it?
Also, I will need to export the results as separate data frames.
I've tried to duplicate some other people's solutions using for loop but doesn't work.
Also, I have all the dataframes separately in my Environment, they are not in a list.
You can get them in list using mget with your pattern, loop over them using lapply and then aggregate
list_name <- ls(pattern = "df\\d+")
list_df <- lapply(mget(list_name), function(x) aggregate(Value~Month, x, sd))
list_df
#$df1
# Month Value
#1 2018-07 17.45566
#$df2
# Month Value
#1 2018-07 185.8744
Or if you want to use tidyverse
library(tidyverse)
list_df <- map(mget(list_name),
. %>% group_by(Month) %>% summarise(sd_value = sd(Value)))
To write them in separate csv's we can use mapply
mapply(function(x, y) write.csv(x,
paste0("path/to/file/", y, ".csv"), row.names = FALSE), list_df, list_name)
data
df1 <- structure(list(Date = structure(1:5, .Label = c("2018-07-16",
"2018-07-17", "2018-07-18", "2018-07-19", "2018-07-20"), class = "factor"),
Month = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "2018-07", class = "factor"),
Value = c(10L, 2L, 4L, 45L, 13L)), class = "data.frame", row.names =
c(NA, -5L))
df2 <- structure(list(Date = structure(1:5, .Label = c("2018-07-16",
"2018-07-17", "2018-07-18", "2018-07-19", "2018-07-20"), class = "factor"),
Month = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "2018-07", class = "factor"),
Value = c(11L, 2L, 4L, 423L, 13L)), class = "data.frame", row.names =
c(NA, -5L))
We can use data.table methods
library(data.table)
lapply(mget(ls(pattern = "df\\d+")), function(x)
setDT(x)[, .(sd_value = sd(Value)), by = Month])

How do I write a function to manipulate several dataframes the same way?

Complete novice. I do not know how to write a function. I have several dataframes that all need to be manipulated in the same way and the output should be dataframes with the same names. I have functioning code that can manipulate a single dataframe. I would like to be able to manipulate several at once.
Here are 2 example df's:
ex1 <- structure(list(info1 = c("Day", "2018.04.03 10:47:33", "2018.04.03 11:20:04", "2018.04.03 11:35:04"), info2 = c("Status_0", "Ok", "Ok", "Ok"
), X = c(200L, 1L, 2L, 3L), X.1 = c(202.5, 1, 2, 3), X.2 = c(205L,
1L, 2L, 3L), X.3 = c(207.5, 1, 2, 3), X.4 = c(210L, 1L, 2L, 3L
), X.5 = c(212.5, 1, 2, 3), X.6 = c(215L, 1L, 2L, 3L)), class = "data.frame", row.names = c(NA, -4L))
ex2 <- structure(list(info1 = c("Day", "2018.04.10 12:47:33", "2018.04.10 13:20:04", "2018.04.10 13:35:04"), info2 = c("Status_0", "Ok", "Ok", "Ok"
), X = c(200L, 1L, 2L, 3L), X.1 = c(202.5, 1, 2, 3), X.2 = c(205L,
1L, 2L, 3L), X.3 = c(207.5, 1, 2, 3), X.4 = c(210L, 1L, 2L, 3L
), X.5 = c(212.5, 1, 2, 3), X.6 = c(215L, 1L, 2L, 3L)), class = "data.frame", row.names = c(NA, -4L))
Here is the functioning code to manipulate 'ex1'
library(tidyverse)
library(lubridate)
colnames(ex1) <- ex1[1,]
ex1 <- ex1 %>%
slice(-1) %>%
rename(Date.Time = "Date/Time") %>%
mutate(timestamp = parse_date_time(Date.Time, "%Y.%m.%d %H:%M:%S")) %>%
select(timestamp, Date.Time, everything()) %>% select(-Date.Time) %>%
select(-c(Status_0:"202.5", "212.5":"215"))
colnames(ex1)[-1] <- paste("raw", colnames(ex1)[-1], sep = "_")
Secondary question: let's say I wanted to change the function so it accepted a df, but also a type (i.e., raw or comp) and the function input would be tidydatafunc(df, type). If I input type=comp it would change the last line of the code where I have "raw" to "comp". How could I change the function to accomodate this?
Any help is greatly appreciated. I'm sure this is basic stuff for most of you!
Wrap your script in function and specify params.
my_fun <- function(df, type = 'comp') {
# basic input validation is extremely useful
stopifnot(is.data.frame(df))
stopifnot(is.character(type))
colnames(df) <- df[1,]
ex1 <- df %>%
slice(-1) %>%
rename(Date.Time = "Date/Time") %>%
mutate(timestamp = parse_date_time(Date.Time, "%Y.%m.%d %H:%M:%S")) %>%
select(timestamp, Date.Time, everything()) %>% select(-Date.Time) %>%
select(-c(Status_0:"202.5", "212.5":"215"))
# pass the character type
colnames(df)[-1] <- paste(type, colnames(df)[-1], sep = "_")
return(df)
}
Then you can use it.
my_fun(ex1, "comp") # view
new_ex1 <- my_fun(ex1, "comp") # save to variable new_ex1

how to find similar strings within a data

My data looks like this
df<- structure(list(A = structure(c(7L, 6L, 5L, 4L, 3L, 2L, 1L, 1L,
1L), .Label = c("", "P42356;Q8N8J0;A4QPH2", "P67809;Q9Y2T7",
"Q08554", "Q13835", "Q5T749", "Q9NZT1"), class = "factor"), B = structure(c(9L,
8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("P62861", "P62906",
"P62979;P0CG47;P0CG48", "P63241;Q6IS14", "Q02413", "Q07955",
"Q08554", "Q5T749", "Q9UQ80"), class = "factor"), C = structure(c(9L,
8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("", "P62807;O60814;P57053;Q99879;Q99877;Q93079;Q5QNW6;P58876",
"P63241;Q6IS14", "Q02413", "Q16658", "Q5T750", "Q6P1N9", "Q99497",
"Q9UQ80"), class = "factor")), .Names = c("A", "B", "C"), class = "data.frame", row.names = c(NA,
-9L))
I want to count how many elements are in each columns including those that are separated with a ; , for example in this case
first column has 9, second column has 12 elements and the third column has 16 elements. then I want to check how many times a element is repeated in other columns . for example
string number of times columns
Q5T749 2 1,2
then remove the strings which are seen more than once from the df
One way to approach this is to start by re-organizing the data into a form that is more convenient to work with. The tidyr and dplyr packages are useful for that sort of thing.
library(tidyr)
df$index <- 1:nrow(df)
df <- gather(df, key = 'variable', value = 'value', -index, na.rm = TRUE)
df <- separate(df, "value", into = paste("x", 1:(1 + max(nchar(gsub("[^;]", "", df$value)))), sep = ""), sep = ";", fill = "right")
df <- gather(df, "which", "value", -index, -variable)
Once you do that counting each element is easy:
addmargins(t(table(df[, c("variable", "value")])), margin = 2)
Dropping duplicates is also easy.
df <- df[!duplicated(df$value), ]
If you really want to put the data back into the original for you can (though I don't recommend it).
df <- spread(df, key = "variable", value = "value")
library(dplyr)
summarize(group_by(df, index),
A = paste(na.omit(A), collapse = ";"),
B = paste(na.omit(B), collapse = ";"),
C = paste(na.omit(C), collapse = ";"))
For the count of elements in each column use this
sapply(df,function(x) length(unlist(sapply(strsplit(as.character(x),"\\s+"),strsplit,split=";"))))
For counting the repetition use this
words <- lapply(df,function(x) unlist(sapply(strsplit(as.character(x),"\\s+"),strsplit,split=";")))
dup_table <- table(unlist(words))
dup_table
There is a very bad approach to remove the repetition
pat <- names(dup_table)[unname(dup_table)>1]
for(i in pat)
df <- as.data.frame.list(lapply(df,function(x) gsub(pattern = i,replacement = "",x)))
But, there is only one problem. It will replace all the occurences of a particular pattern.

Resources