Related
in (R)
for an event study I'm trying to create a column that calculates the mean of ccu_avg for a specific combination of appid and Eventdate1. One appid has multiple events so it has to be divided by both appid and Eventdate1.
The difficult part is that the mean should only be calculated over the rows up until the event date, since the estimation period stops once the event has happened.
The new column should look like est_ccu_avg:
picture of the dataset below for explanation
https://i.stack.imgur.com/ZPquW.png
Could someone help me figure the code for this out? I've been trying for hours and can't seem to get it to work.
I've now been trying things like this but without success:
study <- study %>%
mutate(est_ccu_avg=
mean(study[unique(study$appid) | study$Eventdate1 >
study$datefinal, "ccu_avg"])
)
Result of dput head:
structure(list(appid = c("105600", "105600", "105600", "105600",
"105600", "105600"), name = c("Terraria", "Terraria", "Terraria",
"Terraria", "Terraria", "Terraria"), ccu_avg = c(26825, 29058,
37842, 37525, 26484, 24377), ccu_min = c(21176, 21620, 28954,
32880, 19648, 19118), ccu_max = c(35827, 41322, 50012, 44071,
33241, 32060), pos_max = c(356186, 356363, 356508, 356712, 356921,
357092), neg_max = c(6756, 6756, 6758, 6768, 6766, 6768), Maj_Upt =
c(0,
0, 0, 0, 0, 0), Min_Upt = c(0, 0, 0, 0, 0, 0), Hotfix = c(0,
0, 0, 0, 0, 0), Bugfix = c(0, 0, 0, 0, 0, 0), Balance = c(0,
0, 0, 0, 0, 0), ExpBranch = c(0, 0, 0, 0, 0, 0), Promo = c(0,
1, 0, 0, 0, 0), Ev_Out = c(0, 0, 0, 0, 0, 0), Ev_In = c(0, 0,
0, 0, 0, 0), isfree = c(0, 0, 0, 0, 0, 0), developers1 = c("Re-
Logic",
"Re-Logic", "Re-Logic", "Re-Logic", "Re-Logic", "Re-Logic"),
publishers1 = c("Re-Logic", "Re-Logic", "Re-Logic", "Re-Logic",
"Re-Logic", "Re-Logic"), metascore = c(83, 83, 83, 83, 83,
83), singleplayer = c(1, 1, 1, 1, 1, 1), multiplayer = c(1,
1, 1, 1, 1, 1), coop = c(1, 1, 1, 1, 1, 1), mmo = c(0, 0,
0, 0, 0, 0), indie = c(1, 1, 1, 1, 1, 1), single_player_gen = c(0,
0, 0, 0, 0, 0), adventure = c(1, 1, 1, 1, 1, 1), casual = c(0,
0, 0, 0, 0, 0), strategy = c(0, 0, 0, 0, 0, 0), rpg = c(1,
1, 1, 1, 1, 1), simulation = c(0, 0, 0, 0, 0, 0), multi_player_gen =
c(0,
0, 0, 0, 0, 0), shooter = c(0, 0, 0, 0, 0, 0), platformer = c(0,
0, 0, 0, 0, 0), ea_min = c(0, 0, 0, 0, 0, 0), ea_max = c(0,
0, 0, 0, 0, 0), scifi = c(0, 0, 0, 0, 0, 0), sports = c(0,
0, 0, 0, 0, 0), racing = c(0, 0, 0, 0, 0, 0), inappurchase = c(0,
0, 0, 0, 0, 0), workshop = c(0, 0, 0, 0, 0, 0), f_release_date =
c("May 16, 2011",
"May 16, 2011", "May 16, 2011", "May 16, 2011", "May 16, 2011",
"May 16, 2011"), l_release_date = c("May 16, 2011", "May 16, 2011",
"May 16, 2011", "May 16, 2011", "May 16, 2011", "May 16, 2011"
), datefinal = structure(c(18942, 18943, 18944, 18945, 18946,
18947), class = "Date"), Eventdate = c("", "", "", "", "",
""), Eventdate1 = structure(c(18949, 18949, 18949, 18949,
18949, 18949), class = "Date"), est_ccu_avg = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
I figured it out, there probably is an easier way but this is how I did it:
# Add est_ccu_avg: the mean of ccu_avg over the estimation period (rows dated
# strictly before the event) for every appid / Eventdate1 combination.

# Keep only the estimation-period rows, i.e. those where Eventdate1 > datefinal.
# which() skips rows where the comparison is NA instead of yielding NA rows.
estmeans <- study[which(study$Eventdate1 > study$datefinal), ]

# Mean ccu_avg per appid and event date. Naming the list elements gives the
# result its final column names, so no renaming step is needed afterwards.
studymeans <- aggregate(
  list(est_ccu_avg = estmeans$ccu_avg),
  by = list(appid = estmeans$appid, Eventdate1 = estmeans$Eventdate1),
  FUN = mean
)

# Merge the means back onto the full data.
# - Any existing placeholder est_ccu_avg column is dropped first; otherwise
#   merge() creates est_ccu_avg.x / est_ccu_avg.y and the computed means end
#   up under the name est_ccu_avg.y.
# - all.x = TRUE keeps rows whose appid/event has no estimation-period rows
#   (they get NA) instead of silently dropping them.
studynew <- merge(
  study[, setdiff(names(study), "est_ccu_avg"), drop = FALSE],
  studymeans,
  by = c("appid", "Eventdate1"),
  all.x = TRUE
)
When using data.table, you can use the special symbol .BY to refer to the current value of the grouping variables:
library(data.table)
# For each (appid, Eventdate1) group, average ccu_avg over the rows dated on or
# before that group's event date; .BY holds the current group's key values.
setDT(df)[, mean(ccu_avg[datefinal <= .BY$Eventdate1]),
          by = .(appid, Eventdate1)]
The equivalent in dplyr is cur_group().
# For each (appid, Eventdate1) group, average ccu_avg over the rows dated on or
# before that group's event date. cur_group() returns a one-row tibble holding
# the current group's key values.
df %>%
  group_by(appid, Eventdate1) %>%
  summarize(
    # The subset needs its closing "]" before mean()'s closing parenthesis.
    res = mean(ccu_avg[datefinal <= cur_group()$Eventdate1]),
    # Return an ungrouped result rather than one still grouped by appid.
    .groups = "drop"
  )
This question already has answers here:
How do I dichotomise efficiently
(5 answers)
How to one hot encode several categorical variables in R
(5 answers)
Closed 9 months ago.
I am working on a project that requires me to one-hot code a single variable and I cannot seem to do it correctly.
I simply want to one-hot encode the variable data$Ratings so that the values 1, 2 and 3 are separated into their own columns in the data frame, each equal to either 0 or 1. E.g., if data$Ratings = 3 then the dummy for 3 would be 1. None of the other columns should change.
structure(list(ID = c(284921427, 284926400, 284946595, 285755462,
285831220, 286210009, 286313771, 286363959, 286566987, 286682679
), AUR = c(4, 3.5, 3, 3.5, 3.5, 3, 2.5, 2.5, 2.5, 2.5), URC = c(3553,
284, 8376, 190394, 28, 47, 35, 125, 44, 184), Price = c(2.99,
1.99, 0, 0, 2.99, 0, 0, 0.99, 0, 0), AgeRating = c(1, 1, 1, 1,
1, 1, 1, 1, 1, 1), Size = c(15853568, 12328960, 674816, 21552128,
34689024, 48672768, 6328320, 64333824, 2657280, 1466515), HasSubtitle = c(0,
0, 0, 0, 0, 1, 0, 0, 0, 0), InAppSum = c(0, 0, 0, 0, 0, 1.99,
0, 0, 0, 0), InAppMin = c(0, 0, 0, 0, 0, 1.99, 0, 0, 0, 0), InAppMax = c(0,
0, 0, 0, 0, 1.99, 0, 0, 0, 0), InAppCount = c(0, 0, 0, 0, 0,
1, 0, 0, 0, 0), InAppAvg = c(0, 0, 0, 0, 0, 1.99, 0, 0, 0, 0),
descriptionTermCount = c(263, 204, 97, 272, 365, 368, 113,
129, 61, 87), LanguagesCount = c(17, 1, 1, 17, 15, 1, 0,
1, 1, 1), EngSupported = c(2, 2, 2, 2, 2, 2, 1, 2, 1, 2),
GenreCount = c(2, 2, 2, 2, 3, 3, 3, 2, 3, 2), months = c(7,
7, 7, 7, 7, 7, 7, 8, 8, 8), monthsSinceUpdate = c(29, 17,
25, 29, 15, 6, 71, 12, 23, 134), GameFree = c(0, 0, 0, 0,
0, 1, 0, 0, 0, 0), Ratings = c(3, 3, 3, 3, 2, 3, 2, 3, 2,
3)), row.names = c(NA, 10L), class = "data.frame")
install.packages("mlbench")
install.packages("neuralnet")
install.packages("mltools")
library(mlbench)
library(dplyr)
library(caret)
library(mltools)
library(tidyr)
data2 <- mutate_if(data, is.factor,as.numeric)
data3 <- lapply(data2, function(x) as.numeric(as.character(x)))
data <- data.frame(data3)
summary(data)
head(data)
str(data)
View(data)
#
dput(head(data, 10))
data %>% mutate(value = 1) %>% spread(data$Ratings, value, fill = 0 )
Is this what you want? I will assume your data is called data and continue with that for the data frame you supplied:
library(plm)

# Passing just the column returns a matrix of dummies:
plm::make.dummies(data$Ratings)
## 2 3
## 2 1 0
## 3 0 1

# Passing the data frame plus the column name returns the full data frame
# with the dummies added:
plm::make.dummies(data, col = "Ratings")
## [not printed to save space]
There are some options for plm::make.dummies, e.g., you can select the base category via base and you can choose whether to include the base (add.base = TRUE) or not (add.base = FALSE).
The help page ?plm::make.dummies has more examples and explanation as well as a comparison for LSDV model estimation by a factor variable and by explicitly self-created dummies.
I am currently struggling to remove words from a large data frame in R.
This is the df:
The first column (GeneID) contains a so-called "ensembl gene ID" (e.g. ENSG00000223972.5 in the first row), followed by a "|" and then the real gene name. I now want to remove the "ensembl gene ID", including the "|", so that only the real gene name remains in this column. Is there a smart way to do this, for example with the stringr package?
Cheers!
Edit:
> dput(head(data3))
structure(list(GeneID = c("ENSG00000223972.5|DDX11L1", "ENSG00000227232.5|WASH7P",
"ENSG00000278267.1|MIR6859-1", "ENSG00000243485.5|MIR1302-2HG",
"ENSG00000284332.1|MIR1302-2", "ENSG00000237613.2|FAM138A"),
`DC2-CD5pos-d1` = c(2, 47, 0, 0, 0, 0), `DC2-CD5pos-d2` = c(0,
41, 0, 0, 0, 0), `DC2-CD5pos-d3` = c(2, 31, 0, 0, 0, 0),
`DC2-CD5pos-d4` = c(0, 29, 0, 0, 0, 0), `DC3-d1` = c(1, 36,
0, 0, 0, 0), `DC3-d2` = c(0, 33, 0, 0, 0, 0), `DC3-d3` = c(0,
49, 0, 0, 0, 3), `DC3-d4` = c(0, 27, 0, 0, 0, 0), `DC2-BTLA-S-d1` = c(2,
4, 0, 1, 0, 0), `DC2-BTLA-S-d3` = c(6, 6, 1, 0, 0, 0), `DC2-BTLA-S-d4` = c(2,
1, 0, 0, 0, 0), `DC3-CD163-S-d1` = c(2, 8, 2, 0, 0, 0), `DC3-CD163-S-d3` = c(5,
9, 0, 0, 0, 0), `DC3-CD163-S-d4` = c(0, 5, 0, 0, 0, 0)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
I have a dataset with some variables having a binary type.
The first column contains names, so applying cluster analysis throws an error.
kc <- kmeans(j1,4) ## j1 is the stored data frame
Error in do_one(nmeth) : NA/NaN/Inf in foreign function call (arg 1)
In addition: Warning message: In storage.mode(x) <- "double" : NAs
introduced by coercion –
Here is the head of the data, produced with dput(j1[1:5,]):
structure(list(OUTPUT_NAME = c("nonsaturation_fba268_2ch_0_out.wav",
"nonsaturation_fba268_2ch_32_out.wav", "substreaminfo_fba268_2ch_96_out.wav",
"substreaminfo_fba268_2ch_201_out.wav", "substreaminfo_fba268_2ch_93_out.wav"
), PEAK_MIPS = c(82.47, 82.5, 82.63, 82.73, 82.73), PRESENTATION = c(0,
0, 0, 0, 0), DTHD_ATMOS_PRE = c(0, 0, 0, 0, 0), FBAFBBDETECTER = c(1,
1, 1, 1, 1), DIAL_NORM = c(31, 31, 31, 31, 31), NORMAL_DRC = c(0,
0, 0, 0, 0), ANALOG_DB_GAIN_REQ = c(0, 0, 0, 0, 0), DECODER_CH_ASSIGN = c(1,
1, 1, 1, 1), DECODER_6_CH_ASSIGN = c(1, 1, 13, 1, 1), DECODER_8_CH_ASSIGN = c(1,
1, 13, 1, 1), DECODER_16_CH_ASSIGN = c(0, 0, 0, 0, 0), CH_MODIFIER = c(0,
0, 0, 0, 0), CH_ASSIGNMENT_TYPE = c(0, 0, 0, 0, 0), FILTER_ORDER = c(0,
0, 0, 0, 0), COEFF_BITS = c(9, 9, 9, 9, 9), COEFF_SHIFT = c(7,
7, 7, 7, 7), STATE_BITS = c(4, 4, 6, 6, 6), STATE_SHIFT = c(0,
0, 0, 0, 0), `31EC_PRIMITIVE_MATRIX_CNT` = c(16, 16, 8, 8, 8),
LSB_BYPASS_COUNT = c(0, 0, 0, 0, 0), DITHER_SCALE = c(1,
1, 1, 1, 1), `31EC_FRAC_BITS` = c(14, 14, 12, 12, 12), INTERPOLATION_USED = c(1,
1, 0, 0, 0), `31EA_31EB_PRIMITIVE_MATIX_CNT` = c(0, 0, 0,
0, 0), `31EA_31EB_FRAC_BITS` = c(14, 14, 12, 12, 12), LSB_BYPASS_USED = c(0,
0, 0, 0, 0), AU_LENGTH = c(937, 937, 937, 937, 937), VARIABLE_RATE = c(1,
1, 1, 1, 1), PEAK_DATA_RATE = c(6000, 6000, 6000, 6000, 6000
), SUBSTREAM_CNT = c(1, 1, 2, 2, 2), EXTENDED_SUBSTREAM_CNT = c(0,
0, 0, 0, 0), SUBSTREAM_INFO = c(20, 20, 40, 24, 24), SPEAKER_LAYOUT = c(0,
0, 0, 0, 0), CONTROL_EN_2 = c(0, 0, 0, 0, 0), CONTROL_EN_6 = c(0,
0, 0, 0, 0), CONTROL_EN_8 = c(0, 0, 0, 0, 0), MIX_LEVEL_2 = c(35,
35, 35, 35, 35), MIX_LEVEL_6 = c(35, 35, 35, 35, 35), MIX_LEVEL_8 = c(35,
35, 35, 35, 35), DIALOGUE_NORM_2 = c(31, 31, 31, 31, 31),
DIALOGUE_NORM_6 = c(31, 31, 31, 31, 31), DIALOGUE_NORM_8 = c(31,
31, 31, 31, 31), SOURCE_FORMAT_6 = c(0, 0, 0, 0, 0), SOURCE_FORMAT_8 = c(0,
0, 0, 0, 0), DRC_STARTUP_GAIN = c(0, 0, 0, 0, 0), DIALOGUE_NORM_16 = c(28,
28, 31, 31, 31), MIX_LEVEL_16 = c(35, 35, 35, 35, 35), CHANNEL_CNT_16 = c(16,
16, 16, 16, 16), DYNAMIC_OBJ_ONLY = c(1, 1, 1, 1, 1), DYNAMIC_CHANNEL_CNT_16 = c(0,
0, 0, 0, 0), LFE_PRE = c(1, 1, 0, 0, 0), CHANNEL_CONTENT_DES_16 = c(0,
0, 0, 0, 0), MIN_CHAN = c(0, 0, 0, 0, 0), MAX_CHAN = c(1,
1, 1, 1, 1), RESTART_SYNC_WORD = c(12778, 12778, 12778, 12778,
12778), MAX_MATRIX_CHAN = c(1, 1, 1, 1, 1), DITHER_SHIFT = c(0,
0, 0, 0, 0), ERROR_PROTECT = c(1, 1, 1, 1, 1), LOSSLESS_PROTECT = c(0,
0, 1, 1, 1), BLOCK_SIZE = c(32, 32, 40, 40, 40), OUTPUT_SHIFT = c(0,
0, 0, 0, 0), QUANT_STEP_SIZE = c(0, 0, 0, 0, 0), HUFF_OFFSET = c(0,
0, 0, 0, 0), HUFF_TYPE = c(1, 1, 0, 2, 2), HUFF_LSBS = c(6,
6, 8, 5, 5), SAMPLE_RATE = c(0, 3, 0, 3, 0), OUTPUT_SAMPLE_COUNT = c(40,
40, 40, 40, 40), RESTART_HEADER_EXISTS = c(0, 0, 0, 0, 0)), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
You're using a variable that is not numeric, look at this:
# j1 is a tibble, so j1[, 1] would still be a one-column tibble; [[ ]] extracts
# the column vector itself, whose class is what matters here.
class(j1[[1]])
[1] "character"
You have to remove it to make kmeans work:
# Cluster on every column except the first (the character names column).
# Fixing the seed makes the random starting centers reproducible.
set.seed(1234)
kmeans(j1[, -1], centers = 2)
I have the raw totals for three values that I was looking to display over time in a stacked bar chart, but I don't know how to display this.
I have the percentage values (.22, et cetera), and the raw numbers.
How would I create a stacked bar chart using ggplot2 considering I have three proportions I am trying to graph. Do I need to melt the data?
I would like to do something like:
ggplot(data, aes(fill=condition, y=value, x=specie)) +
geom_bar( stat="identity", position="fill")
But I do not know how to do this as my data isn't formatted right. Should I use dplyr?
Here is my df:
structure(list(date = structure(c(17405, 17406, 17407, 17408,
17409, 17410, 17411, 17412, 17413, 17414), class = "Date"), total_membership = c(1,
1, 1, 1, 1, 188, 284, 324, 354, 390), full_members = c(1, 1,
1, 1, 1, 188, 284, 324, 354, 390), guests = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0), daily_active_members = c(1, 1, 1, 1, 1, 169,
225, 214, 203, 254), daily_members_posting_messages = c(1, 0,
1, 0, 1, 111, 110, 96, 67, 70), weekly_active_members = c(1,
1, 1, 1, 1, 169, 270, 309, 337, 378), weekly_members_posting_messages = c(1,
1, 1, 1, 1, 111, 183, 218, 234, 255), messages_in_public_channels = c(4,
0, 0, 0, 1, 252, 326, 204, 155, 135), messages_in_private_channels = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), messages_in_shared_channels = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), messages_in_d_ms = c(1, 0, 0, 0,
0, 119, 46, 71, 70, 122), percent_of_messages_public_channels = c(0.8,
0, 0, 0, 1, 0.6792, 0.8763, 0.7418, 0.6889, 0.5253), percent_of_messages_private_channels = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), percent_of_messages_d_ms = c(0.2,
0, 0, 0, 0, 0.3208, 0.1237, 0.2582, 0.3111, 0.4747), percent_of_views_public_channels = c(0.2857,
1, 1, 1, 1, 0.8809, 0.9607, 0.945, 0.9431, 0.9211), percent_of_views_private_channels = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), percent_of_views_d_ms = c(0.7143,
0, 0, 0, 0, 0.1191, 0.0393, 0.055, 0.0569, 0.0789), name = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), public_channels_single_workspace = c(10,
10, 11, 11, 12, 12, 12, 13, 13, 13), messages_posted = c(35,
35, 37, 38, 66, 1101, 1797, 2265, 2631, 3055)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
Here is an example using a toy data set, where the original data are first grouped and summarised to get the 'proportions', then piped to ggplot, which will automatically create a stacked bar plot
# Toy data: 1000 rows, each with a randomly drawn group (a-j), species (1-4)
# and amount (10-30). TRUE is spelled out because T is an ordinary variable
# that can be reassigned.
df <- data.frame(
  group = sample(letters[1:10], 1000, replace = TRUE),
  species = sample(1:4, 1000, replace = TRUE),
  amount = sample(10:30, 1000, replace = TRUE)
)

# Mean amount per group/species combination, drawn as bars stacked by species.
df %>%
  group_by(group, species) %>%
  summarise(perc = mean(amount)) %>%
  ggplot(aes(group, perc, fill = factor(species))) +
  geom_bar(stat = "identity")
UPDATE
This will calculate the proportion that 'species' occurs within each 'group'.
# Count the rows per group/species, convert the counts to within-group shares,
# and draw them as bars stacked by species.
df %>%
  group_by(group, species) %>%
  summarise(n = n()) %>%
  group_by(group) %>%
  mutate(perc = n / sum(n)) %>%
  ggplot(aes(group, perc, fill = factor(species))) +
  geom_col()