Related
Based on the data and code below, is it possible to sort the table in descending order?
Data (df):
structure(list(CITYNAME = c("a", "b", "c",
"d", "e", "f", "g",
"h", "i", "j", "k",
"l", "m", "n", "p", "q",
"r", "s", "t", "u",
"w", "x", "y", "z"), AvgPpt = c(127.785,
131.456, 128.357, 114.792, 131.383, 129.696, 137.008, 136.129,
132.881, 131.676, 129.103, 132.475, 122.263, 132.393, 134.552,
120.322, 125.987, 132.337, 131.18, 122.705, 123.285, 128.853,
134.494, 114.154)), row.names = c(NA, -24L), class = c("tbl_df",
"tbl", "data.frame"))
Code:
library(ggpubr)

tbl_ppt <- df %>%
  ggtexttable(cols = c("Municipality", "Average Precipitation (mm)"),
              rows = NULL,
              theme = ttheme("mBlue"))
tbl_ppt
You could arrange your data in your desired order before passing it to ggtexttable:
library(ggpubr)
library(dplyr)

df %>%
  arrange(desc(AvgPpt)) %>%
  ggtexttable(cols = c("Municipality", "Average Precipitation (mm)"),
              rows = NULL,
              theme = ttheme("mBlue"))
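ggtexttable simply renders rows in the order they appear in the data, so other orderings work the same way; for example, ascending order is just arrange() without desc():

df %>%
  arrange(AvgPpt) %>%
  ggtexttable(cols = c("Municipality", "Average Precipitation (mm)"),
              rows = NULL,
              theme = ttheme("mBlue"))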
I have a table as shown.
df <- data.frame("name" = c("jack", "william", "david", "john"),
                 "01-Jan-19" = c(NA, "A", NA, "A"),
                 "01-Feb-19" = c("A", "A", NA, "A"),
                 "01-Mar-19" = c("A", "A", "A", "A"),
                 "01-Apr-19" = c("A", "A", "A", "A"),
                 "01-May-19" = c(NA, "A", "A", "A"),
                 "01-Jun-19" = c("A", "SA", "A", "SA"),
                 "01-Jul-19" = c("A", "SA", "A", "SA"),
                 "01-Aug-19" = c(NA, "SA", "A", "SA"),
                 "01-Sep-19" = c(NA, "SA", "A", "SA"),
                 "01-Oct-19" = c("SA", "SA", "A", "SA"),
                 "01-Nov-19" = c("SA", "SA", NA, "SA"),
                 "01-Dec-19" = c("SA", "SA", "SA", NA),
                 "01-Jan-20" = c("SA", "M", "A", "M"),
                 "01-Feb-20" = c("M", "M", "M", "M"),
                 check.names = FALSE)
Over a time period, each person journeys through a position progression (three categories, from A to SA to M). My objective is:
Calculate the average duration of the A (assistant) and SA (senior assistant) positions, i.e. the duration between the date a category first appears and the date it last appears, regardless of missing data in between.
I reshaped the data to long format using tidyr's gather() function,
df1 <- gather(df, "date", "position", 2:15)
but then I am not sure how best to proceed. What might be the best way to approach this further?
We can get the data in long format and calculate the number of days between the first date on which the person was "SA" and the first date on which they were "A".
library(dplyr)
library(lubridate)

df %>%
  tidyr::pivot_longer(cols = -name, names_to = 'date', values_drop_na = TRUE) %>%
  mutate(date = dmy(date)) %>%
  group_by(name) %>%
  summarise(duration = date[match('SA', value)] - date[match('A', value)])
# name duration
# <fct> <drtn>
#1 david 275 days
#2 jack 242 days
#3 john 151 days
#4 william 151 days
If the mean value is needed, we can pull the column and calculate the mean by adding to the above chain:
%>% pull(duration) %>% mean()
#Time difference of 204.75 days
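Note that this measures the time from each person's first "A" to their first "SA". If the objective is instead the literal span from the first to the last appearance of each category, a minimal sketch under that reading (assuming the same long format, lubridate loaded, and dplyr >= 1.0 for the .groups argument) could be:

df %>%
  tidyr::pivot_longer(cols = -name, names_to = 'date', values_drop_na = TRUE) %>%
  mutate(date = dmy(date)) %>%
  # keep only the two categories of interest
  filter(value %in% c("A", "SA")) %>%
  # per person and category: last appearance minus first appearance
  group_by(name, value) %>%
  summarise(duration = max(date) - min(date), .groups = "drop") %>%
  # average over people within each category
  group_by(value) %>%
  summarise(avg_duration = mean(duration))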
data
df <- structure(list(name = c("jack", "william", "david", "john"),
`01-Jan-19` = c(NA, "A", NA, "A"), `01-Feb-19` = c("A", "A",
NA, "A"), `01-Mar-19` = c("A", "A", "A", "A"), `01-Apr-19` = c("A",
"A", "A", "A"), `01-May-19` = c(NA, "A", "A", "A"), `01-Jun-19` = c("A",
"SA", "A", "SA"), `01-Jul-19` = c("A", "SA", "A", "SA"),
`01-Aug-19` = c(NA, "SA", "A", "SA"), `01-Sep-19` = c(NA,
"SA", "A", "SA"), `01-Oct-19` = c("SA", "SA", "A", "SA"),
`01-Nov-19` = c("SA", "SA", NA, "SA"), `01-Dec-19` = c("SA",
"SA", "SA", NA), `01-Jan-20` = c("SA", "M", "A", "M"), `01-Feb-20` = c("M",
"M", "M", "M")), row.names = c(NA, -4L), class = "data.frame")
I'm trying to perform a t.test for a specific subset of data. Say I have a data set of 116 birds, and want to find a random sample of 35 birds (non-unique) of the "Species" category. I then want to find the mean of the "Body.Mass" of these random species. Then, I want to invoke a t.test on this sample as representative of the whole data.
I first stored the data in the object "bird". I tried taking the random sample using sample(bird$Species, 35), which yielded 35 random species of bird, but now I can't seem to further subset this random sample to find the mean Body.Mass of each sampled species. I tried subsetting with the tidyverse, but that's the only approach I'm aware of for a problem like this.
library(dplyr)
bird = read.csv("NZBIRDS.csv")
dput(head(bird))
set.seed(20)
sambird = sample(bird$Species,35)
sambird
bmbird <- sambird %>% summarize(avg = mean(Body.Mass))
bmbird
structure(list(Species = c("Grebes", "Grebes", "Petrels", "Petrels",
"Petrels", "Petrels"), Name = c("P. cristatus", "P. rufopectus",
"P. gavia", "P. assimilis", "P. urinatrix", "P. georgicus"),
Extinct = c("No", "No", "Yes", "Yes", "Yes", "No"), Habitat = c("A",
"A", "A", "A", "A", "A"), Nest.Site = c("G", "G", "GC", "GC",
"GC", "GC"), Nest.Density = c("L", "L", "H", "H", "H", "H"
), Diet = c("F", "F", "F", "F", "F", "F"), Flight = c("Yes",
"Yes", "Yes", "Yes", "Yes", "Yes"), Body.Mass = c(1100L,
250L, 300L, 200L, 130L, 120L), Egg.Length = c(57, 43, 57,
54, 38, 39)), .Names = c("Species", "Name", "Extinct", "Habitat",
"Nest.Site", "Nest.Density", "Diet", "Flight", "Body.Mass", "Egg.Length"
), row.names = c(NA, 6L), class = "data.frame")
Error in UseMethod("summarise_") : no applicable method for 'summarise_' applied to an object of class "factor"
It's a bit unclear whether you want to sample from a list of the unique species in the data, or sample rows so that each "Species" type can appear multiple times in the data. If you want to sample from the unique species, you can do:
# Only sampling one species here, since the example data contains
# just two distinct species; the same code works for a larger sample
random_species <- sample(unique(bird$Species), 1, replace = FALSE)

bird %>%
  filter(Species %in% random_species) %>%
  group_by(Species) %>%
  summarize(avg = mean(Body.Mass))
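If instead you want to sample 35 rows, so that a species can appear more than once, a minimal sketch (assuming dplyr >= 1.0 for slice_sample(); sample_n() behaves similarly in older versions) could be:

set.seed(20)
samp <- bird %>% slice_sample(n = 35, replace = TRUE)

# Mean body mass of the sampled rows
mean(samp$Body.Mass)

# One-sample t-test of the sample against the overall mean
t.test(samp$Body.Mass, mu = mean(bird$Body.Mass))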
I have a data set with over 100 columns, but for example let's suppose I have a data set that looks like this:
dput(tib)
structure(list(f_1 = c("A", "O", "AC", "AC", "AC", "O", "A", "AC", "O", "O"), f_2 = c("New", "New",
"New", "New", "Renewal", "Renewal", "New", "Renewal", "New",
"New"), first_dt = c("07-MAY-18", "25-JUL-16", "09-JUN-18", "22-APR-19",
"03-MAR-19", "10-OCT-16", "08-APR-19", "27-FEB-17", "02-MAY-16",
"26-MAY-15"), second_dt = c(NA, "27-JUN-16", NA, "18-APR-19",
"27-FEB-19", "06-OCT-16", "04-APR-19", "27-FEB-17", "25-APR-16",
NA), third_dt = c("04-APR-16", "21-JUL-16", "05-JUN-18", "18-APR-19",
"27-FEB-19", "06-OCT-16", "04-APR-19", "27-FEB-17", "25-APR-16",
"19-MAY-15"), fourth_dt = c("05-FEB-15", "25-JAN-16", "05-JUN-18",
"10-OCT-18", "08-JAN-19", "02-SEP-16", "24-OCT-18", "29-SEP-16",
"27-JAN-15", "14-MAY-15"), fifth_dt = structure(c(1459728000,
1469059200, 1528156800, 1555545600, 1551225600, 1475712000, 1554336000,
1488153600, 1461542400, 1431993600), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), sex = c("M", "M", "F", "F", "M", "F", "F",
"F", "F", "F")), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
Most of the date columns (ends_with("dt")) are strings, but I want to convert them into dates. I tried mutate_at but received the following error:
tib %>% mutate_at(vars(ends_with("dt")), funs(parse_date_time(.))) %>% glimpse()
Error in mutate_impl(.data, dots) :
Evaluation error: argument "orders" is missing, with no default.
Any thoughts on what caused this error? Should I use a different mutate function?
As akrun noted, one of the columns is already in dttm format. Once that column is ignored, the following code works for me:
library(dplyr)
library(lubridate)

tib %>%
  select(-fifth_dt) %>%
  mutate_at(vars(ends_with("dt")), parse_date_time, orders = "%d-%m-%y")
funs() is deprecated; list() can be used in its place:
library(dplyr)
library(lubridate)

tib %>%
  mutate_at(3:6, list(~ parse_date_time(., "%d-%m-%y")))
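With current dplyr, across() supersedes mutate_at(); a sketch that converts only the character date columns and leaves fifth_dt untouched (assuming dplyr >= 1.0 and lubridate) could be:

library(dplyr)
library(lubridate)

tib %>%
  mutate(across(ends_with("dt") & where(is.character),
                ~ parse_date_time(.x, orders = "dby")))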
I have over 800 dbf files which I need to import and merge in R. I have been able to bring in all of the files using this code:
library(foreign)
setwd("c:/temp/help/")
files <- list.files(pattern="\\.dbf$")
all.the.data <- lapply(files, read.dbf, as.is = FALSE)
DATA <- do.call("rbind", all.the.data)
However, these dbf files have different numbers of columns and even if they sometimes have the same number of columns, those headers may be different. Here are four of the dbf files to provide an example:
file01 <- structure(list(PLOTBUFFER = structure(1L, .Label = "1002_2km", class = "factor"),
VALUE_11 = 11443500, VALUE_31 = 13500, VALUE_42 = 928800,
VALUE_43 = 162000, VALUE_90 = 18900), .Names = c("PLOTBUFFER",
"VALUE_11", "VALUE_31", "VALUE_42", "VALUE_43", "VALUE_90"), row.names = c(NA,
-1L), class = "data.frame", data_types = c("C", "F", "F", "F",
"F", "F"))
file02 <- structure(list(PLOTBUFFER = structure(1L, .Label = "1002_5km", class = "factor"),
VALUE_11 = 66254400, VALUE_21 = 125100, VALUE_31 = 80100,
VALUE_41 = 4234500, VALUE_42 = 3199500, VALUE_43 = 4194000,
VALUE_52 = 376200, VALUE_90 = 72000), .Names = c("PLOTBUFFER",
"VALUE_11", "VALUE_21", "VALUE_31", "VALUE_41", "VALUE_42", "VALUE_43",
"VALUE_52", "VALUE_90"), row.names = c(NA, -1L), class = "data.frame", data_types = c("C",
"F", "F", "F", "F", "F", "F", "F", "F"))
file03 <- structure(list(PLOTBUFFER = structure(1L, .Label = "1003_2km", class = "factor"),
VALUE_11 = 1972800, VALUE_31 = 125100, VALUE_41 = 5316300,
VALUE_42 = 990900, VALUE_43 = 1995300, VALUE_52 = 740700,
VALUE_90 = 1396800, VALUE_95 = 25200), .Names = c("PLOTBUFFER",
"VALUE_11", "VALUE_31", "VALUE_41", "VALUE_42", "VALUE_43", "VALUE_52",
"VALUE_90", "VALUE_95"), row.names = c(NA, -1L), class = "data.frame", data_types = c("C",
"F", "F", "F", "F", "F", "F", "F", "F"))
file04 <- structure(list(PLOTBUFFER = structure(1L, .Label = "1003_5km", class = "factor"),
VALUE_11 = 43950600, VALUE_31 = 270000, VALUE_41 = 12969900,
VALUE_42 = 5105700, VALUE_43 = 12614400, VALUE_52 = 1491300,
VALUE_90 = 2055600, VALUE_95 = 70200), .Names = c("PLOTBUFFER",
"VALUE_11", "VALUE_31", "VALUE_41", "VALUE_42", "VALUE_43", "VALUE_52",
"VALUE_90", "VALUE_95"), row.names = c(NA, -1L), class = "data.frame", data_types = c("C",
"F", "F", "F", "F", "F", "F", "F", "F"))
I would like the dataframe to match this:
merged <- structure(list(PLOTBUFFER = structure(1:2, .Label = c("1002_2km",
"1002_5km"), class = "factor"), VALUE_11 = c(11443500, 66254400
), VALUE_21 = c(0, 125100), VALUE_31 = c(13500, 80100), VALUE_41 = c(0,
4234500), VALUE_42 = c(928800, 3199500), VALUE_43 = c(162000,
4194000), VALUE_52 = c(0, 376200), VALUE_90 = c(18900, 72000)), .Names = c("PLOTBUFFER",
"VALUE_11", "VALUE_21", "VALUE_31", "VALUE_41", "VALUE_42", "VALUE_43",
"VALUE_52", "VALUE_90"), class = "data.frame", row.names = c(NA,
-2L))
If a column is missing from one of the datasets, it should simply be filled in with a zero or NA.
The suggestion by @infominer worked for the four files I included as an example, but when I tried to use merge_recurse on the large list of 802 elements, I received an error:
files <- list.files(pattern="\\.dbf$")
all.the.data <- lapply(files, read.dbf, as.is=FALSE)
merged <- merge_recurse(all.the.data)
Error: evaluation nested too deeply: infinite recursion / options(expressions=)?
Error during wrapup: evaluation nested too deeply: infinite recursion / options(expressions=)?
Use the reshape package:
library(reshape)
merged.files <- merge_recurse(list(file01, file02, file03, file04))
Edit:
Try this code, thanks to Ramnath:
Reduce(function(...) merge(..., all = TRUE), all.the.data)
adapted from https://stackoverflow.com/a/6947326/2747709
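If the goal is simply to stack the files and fill in the missing columns, a sketch using dplyr (an assumption; the answer above uses reshape and base merge()) is bind_rows(), which pads missing columns with NA; the NAs can then be replaced with 0 to match the desired output:

library(dplyr)

# bind_rows() pads columns that are absent from a given file with NA
merged <- bind_rows(all.the.data)   # or bind_rows(file01, file02, file03, file04)

# replace the padding NAs with 0, as in the desired merged output
merged[is.na(merged)] <- 0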