I have this data
> dff_all[1:10,c(2,3)]
cet_hour_of_registration country_id
1 20 SE
2 12 SE
3 11 SE
4 15 GB
5 12 SE
6 14 BR
7 23 MX
8 13 SE
9 1 BR
10 9 SE
and I want to create a variable hour with the local time. The conversions from CET to local time are:
FI +1, MX -7, UK -1, BR -5.
I tried to do it with a nested IF but could not make it work.
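For reference, a vectorized ifelse chain does work for a handful of countries. Here is a sketch against the question's columns, adding the offsets exactly as stated (FI +1 and so on; note the sample data uses GB where the question says UK). The answers below subtract a time_diff column instead, so pick one sign convention and stick to it:
# a nested-ifelse sketch; adds the offsets as stated in the question
dff_all$hour <- dff_all$cet_hour_of_registration +
  ifelse(dff_all$country_id == "FI",  1,
  ifelse(dff_all$country_id == "MX", -7,
  ifelse(dff_all$country_id == "GB", -1,
  ifelse(dff_all$country_id == "BR", -5, 0))))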
#Create a data lookup table
country_id <- c("FI", "MX", "UK", "BR", "SE")
time_diff <- c(1,-7,-1,-5, 0)
df <- data.frame(country_id, time_diff)
#this is a substitute data frame for your data.
hour_reg <- c(20,12,11,15,5)
dff_all <- data.frame(country_id, hour_reg)
# join the tables with dplyr's left_join, or with base merge (double-check the join type for your needs)
library(dplyr)
new_table <- left_join(dff_all, df, by = "country_id")
#make new column
mutate(new_table, hour = hour_reg - time_diff)
#output
country_id hour_reg time_diff hour
1 FI 20 1 19
2 MX 12 -7 19
3 UK 11 -1 12
4 BR 15 -5 20
5 SE 5 0 5
Base package:
# A variation of the example provided by vinchinzu
# Original table
country_id <- c("FI", "MX", "UK", "BR", "SE", "SP", "RE")
hour_reg <- c(20, 12, 11, 15, 5, 3, 7)
df1 <- data.frame(country_id, hour_reg)
# Lookup table
country_id <- c("FI", "MX", "UK", "BR", "SE")
time_diff <- c(1, -7, -1, -5, 0)
df2 <- data.frame(country_id, time_diff)
# We merge them and calculate a new column
full <- merge(df1, df2, by = "country_id", all.x = TRUE)
full$hour <- full$hour_reg - full$time_diff
full
Output; for any country missing from the lookup table, we get NA:
country_id hour_reg time_diff hour
1 BR 15 -5 20
2 FI 20 1 19
3 MX 12 -7 19
4 RE 7 NA NA
5 SE 5 0 5
6 SP 3 NA NA
7 UK 11 -1 12
If we would like to keep only the rows without NA:
full[complete.cases(full), ]
To replace NAs with zeros:
full[is.na(full)] <- 0
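One more wrinkle, not covered in the answers above: applying an offset can push an hour outside 0-23 (the MX row in the question starts at hour 23), so a modulo wrap keeps the result a valid clock hour:
# wrap the adjusted hour back onto the 0-23 clock; R's %% returns a
# non-negative result for a positive modulus, so negative hours work too
full$hour <- full$hour %% 24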
I am new to R.
Currently I am working on raw data that includes thousands of codes. I need to extract the code and the numbers into separate columns.
I have the data as below
df <- data.frame(num = 1:3, CD = c("1999HZ0BT", "1998HQ1ML", "1964MN3JK"))
The output I wish to have:
df2 <- data.frame(num = 1:3, NUMBER = c(1999, 1998, 1964), VER = c(0,1,3), CD = c("HZBT", "HQML", "MNJK"))
Thank you for your help!
You could use regular expressions and Map to apply them consecutively.
res <- setNames(data.frame(df$num,
                           Map(function(x, y) gsub(x, y, df$CD),
                               # keep the leading four digits; keep the lone inner digit; drop all digits
                               c("(\\d{4}).*", ".*\\w(\\d)\\w.*", "\\d"),
                               c("\\1", "\\1", ""))),
                c("num", "NUMBER", "VER", "CD"))
res
# num NUMBER VER CD
# 1 1 1999 0 HZBT
# 2 2 1998 1 HQML
# 3 3 1964 3 MNJK
You can use extract from tidyr:
If you want to extract data based on position
library(tidyr)
df1 <- extract(df, CD, c('NUMBER', 'CD1', 'VER', 'CD2'), '(.{4})(..)(.)(..)')
Or if you want to extract data based on characters and numbers
df1 <- extract(df, CD, c('NUMBER', 'CD1', 'VER', 'CD2'),
'(\\d+)([A-Z]+)(\\d+)([A-Z]+)')
Both of the above return:
df1
# num NUMBER CD1 VER CD2
#1 1 1999 HZ 0 BT
#2 2 1998 HQ 1 ML
#3 3 1964 MN 3 JK
You can combine CD1 and CD2 using unite
unite(df1, CD, CD1, CD2, sep = "")
# num NUMBER CD VER
#1 1 1999 HZBT 0
#2 2 1998 HQML 1
#3 3 1964 MNJK 3
Use substr:
library(dplyr)
df %>%
mutate(NUMBER = substr(CD, 1, 4),
VER = substr(CD, 7, 7),
CD = paste(substr(CD, 5, 6), substr(CD, 8, 9), sep = ""))
num CD NUMBER VER
1 1 HZBT 1999 0
2 2 HQML 1998 1
3 3 MNJK 1964 3
I am looking for a way to check whether two columns in a data frame contain the same elements for one or more rows, and then eliminate the row containing more NAs.
Let's assume we have a data frame as such:
x <- data.frame("Year" = c(2017,2017,2017,2018,2018),
"Country" = c("Sweden", "Sweden", "Norway", "Denmark", "Finland"),
"Sales" = c(15, 15, 18, 13, 12),
"Campaigns" = c(3, NA, 4, 1, 1),
"Employees" = c(15, 15, 12, 8, 9),
"Satisfaction" = c(0.8, NA, 0.9, 0.95, 0.87),
"Expenses" = c(NA, NA, 9000, 7500, 4300))
Note that the entry for Sweden in the year 2017 is there twice, but the first row has one NA while the other contains NAs in three places. Now I would like to check whether two rows contain the same "Year" and "Country", then eliminate the row containing the higher number of NAs, in this case the second row. I did some research but could not find a solution for this particular case.
Thank you very much in advance.
Using dplyr:
library(dplyr)
x %>%
mutate(n_na = rowSums(is.na(.))) %>% ## calculate NAs for each row
group_by(Year, Country) %>% ## for each year/country
arrange(n_na) %>% ## sort by number of NAs
slice(1) %>% ## take the first row
select(-n_na) ## remove the NA counter column
# A tibble: 4 x 7
# Groups: Year, Country [4]
Year Country Sales Campaigns Employees Satisfaction Expenses
<dbl> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2017 Norway 18 4 12 0.90 9000
2 2017 Sweden 15 3 15 0.80 NA
3 2018 Denmark 13 1 8 0.95 7500
4 2018 Finland 12 1 9 0.87 4300
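In recent dplyr versions (1.0 and later) the arrange/slice pair can be collapsed into slice_min; a sketch of the same pipeline, where with_ties = FALSE keeps exactly one row per group:
x %>%
  mutate(n_na = rowSums(is.na(.))) %>%            ## calculate NAs for each row
  group_by(Year, Country) %>%
  slice_min(n_na, n = 1, with_ties = FALSE) %>%   ## keep the row with the fewest NAs
  select(-n_na)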
We can use a data.table approach
library(data.table)
ind <- setDT(x)[, {
  i1 <- Reduce(`+`, lapply(.SD, is.na))  # NA count for each row in the group
  .I[i1 > 0 & (i1 == max(i1))]           # global indices of the rows with the most NAs
}, .(Year, Country)]$V1
x[-ind]
# Year Country Sales Campaigns Employees Satisfaction Expenses
#1: 2017 Sweden 15 3 15 0.80 NA
#2: 2017 Norway 18 4 12 0.90 9000
#3: 2018 Denmark 13 1 8 0.95 7500
#4: 2018 Finland 12 1 9 0.87 4300
Base R solution:
x$nas <- rowSums(sapply(x, is.na))
do.call(rbind,
by(x, x[c("Year","Country")],
function(df) head(df[order(df$nas),,drop=FALSE], n=1)))
# Year Country Sales Campaigns Employees Satisfaction Expenses nas
# 4 2018 Denmark 13 1 8 0.95 7500 0
# 5 2018 Finland 12 1 9 0.87 4300 0
# 3 2017 Norway 18 4 12 0.90 9000 0
# 1 2017 Sweden 15 3 15 0.80 NA 1
Not too surprisingly, the data.table implementation is the fastest, though I'm a little surprised by how much faster it was than base R; the small size of the dataset could affect this. (In the benchmarking, I had to create a copy of the original, since data.table modifies the data in place, so x is no longer a data.frame.)
library(microbenchmark)
microbenchmark(
data.table = {
x0 <- copy(x)
ind <- setDT(x0)[, {
i1 <- Reduce(`+`, lapply(.SD, is.na))
.I[i1 > 0 & (i1 == max(i1))]
}, .(Year, Country)]$V1
x0[-ind]
},
dplyr = {
x %>%
mutate(n_na = rowSums(is.na(.))) %>% ## calculate NAs for each row
group_by(Year, Country) %>% ## for each year/country
arrange(n_na) %>% ## sort by number of NAs
slice(1) %>% ## take the first row
select(-n_na) ## remove the NA counter column
},
base = {
x0 <- x
x0$nas <- rowSums(sapply(x0, is.na))
do.call(rbind,
by(x0, x0[c("Year","Country")],
function(df) head(df[order(df$nas),,drop=FALSE], n=1)))
}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# data.table 1.223477 1.441005 1.973714 1.582861 1.919090 12.837569 100
# dplyr 2.675239 2.901882 4.465172 3.079295 3.806453 42.261540 100
# base 2.039615 2.209187 2.737758 2.298714 2.570760 8.586946 100
I'm trying to create a column in a dataset that tells me the (approximate) number of months a customer has been with the company.
This is my current attempt:
dat <- data.frame(ID = c(1:4), start.date = as.Date(c('2015-04-09', '2014-03-24', '2016-07-01', '2011-02-02')))
dat$months.customer <- apply(dat[2], 1, function(x) (as.numeric(Sys.Date())- as.numeric(x))/30)
It's returning all NAs
You can use difftime. (The apply version returns NA because apply coerces the data frame to a character matrix, so as.numeric is run on strings like "2015-04-09".)
dat$months.customer <-
as.numeric(floor(difftime(Sys.Date(),dat$start.date,units="days")/30))
# ID start.date months.customer
# 1 1 2015-04-09 16
# 2 2 2014-03-24 29
# 3 3 2016-07-01 1
# 4 4 2011-02-02 67
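If you want calendar months rather than 30-day blocks, one alternative sketch (not from the answer above) counts year and month boundaries using the POSIXlt fields:
# count month boundaries crossed; POSIXlt's year is years since 1900, mon is 0-11
now   <- as.POSIXlt(Sys.Date())
start <- as.POSIXlt(dat$start.date)
dat$months.customer <- (now$year - start$year) * 12 + (now$mon - start$mon)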
I have a data table like this
ID DAYS FREQUENCY
"ads" 20 3
"jwa" 45 2
"mno" 4 1
"ads" 13 3
"jwa" 60 2
"ads" 18 3
I want to add a column that, within each ID, gives the difference between each DAYS value and the closest smaller DAYS value for that ID.
My new table would look like this:
ID DAYS FREQUENCY DAYS DIFF
"ads" 20 3 2 (because 20-18)
"jwa" 45 2 NA (because no value greater than 45 for that id)
"mno" 4 1 NA
"ads" 13 3 NA
"jwa" 60 2 15
"ads" 18 3 5
Bonus: Is there a way to use the merge function? (A merge-based sketch follows the answers below.)
Here's an answer using dplyr:
require(dplyr)
mydata %>%
mutate(row.order = row_number()) %>% # row numbers added to preserve original row order
group_by(ID) %>%
arrange(DAYS) %>%
mutate(lag = lag(DAYS)) %>%
mutate(days.diff = DAYS - lag) %>%
ungroup() %>%
arrange(row.order) %>%
select(ID, DAYS, FREQUENCY, days.diff)
Output:
ID DAYS FREQUENCY days.diff
<fctr> <int> <int> <int>
1 ads 20 3 2
2 jwa 45 2 NA
3 mno 4 1 NA
4 ads 13 3 NA
5 jwa 60 2 15
6 ads 18 3 5
You can do this using dplyr and a quick loop:
library(dplyr)
# Rowwise data.frame creation because I'm too lazy not to copy-paste the example data
df <- tibble::frame_data(
~ID, ~DAYS, ~FREQUENCY,
"ads", 20, 3,
"jwa", 45, 2,
"mno", 4, 1,
"ads", 13, 3,
"jwa", 60, 2,
"ads", 18, 3
)
# Subtract each number in a numeric vector with the one following it
rolling_subtraction <- function(x) {
out <- vector('numeric', length(x))
for (i in seq_along(out)) {
out[[i]] <- x[i] - x[i + 1] # x[i + 1] is NA if the index is out of bounds
}
out
}
# Arrange data.frame in order of ID / Days and apply rolling subtraction
df %>%
arrange(ID, desc(DAYS)) %>%
group_by(ID) %>%
mutate(days_diff = rolling_subtraction(DAYS))
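As for the bonus question: yes, merge can do it via a self-join. A base-R sketch, assuming the df defined above (note that merge and aggregate do not preserve the original row order):
pairs <- merge(df, df, by = "ID")               # all within-ID row pairs
pairs <- pairs[pairs$DAYS.y < pairs$DAYS.x, ]   # keep only smaller partners
diffs <- aggregate(cbind(days.diff = DAYS.x - DAYS.y) ~ ID + DAYS.x,
                   data = pairs, FUN = min)     # gap to the closest smaller DAYS
merge(df, diffs, by.x = c("ID", "DAYS"),
      by.y = c("ID", "DAYS.x"), all.x = TRUE)   # NA where no smaller DAYS exists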
I have a data set that is in a long format, and I can't seem to get it into the right shape for analysis. Perhaps this shape is appropriate; my experience has been almost entirely with wide-format data, so this data file is not making sense to me. (Reproducible data at the end of the post.)
> head(df,10)
ID attributes values
1 1 AU AAA
2 1 AU BBB
3 1 YR 2014
4 2 AU CCC
5 2 AU DDD
6 2 AU EEE
7 2 AU FFF
8 2 AU GGG
9 2 YR 2013
10 3 AU HHH
The attributes column contains variables of interest to me, and I want to perform a series of aggregation functions. For example, I would like to:
1. Obtain a count of the number of authors (AU) for each ID. For example:
ID N.AU
1 2
2 5
3 1
4 2
5 5
6 1
2. Compute the median number of authors (AU) by year (YR):
YR Median.N.AU
2013 5.0
2014 1.5
For both of these examples, I have tried dplyr with group_by and summarise, but haven't cracked the code. I have also tried dcast. My hope is to come up with a solution that I can easily generalize to a larger data frame with many more attributes that take either a single value or multiple values. Any help or pointers to a similar solution would be greatly appreciated.
attributes = c("AU", "AU", "YR", "AU", "AU", "AU", "AU", "AU", "YR", "AU", "YR",
"AU", "AU", "YR", "AU", "AU", "AU", "AU", "AU", "YR", "AU", "YR")
ID = c(1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6)
values = c("AAA", "BBB", "2014", "CCC", "DDD", "EEE", "FFF", "GGG", "2013", "HHH", "2014",
"III", "JJJ", "2014", "KKK", "LLL", "MMM", "NNN", "OOO", "2013", "PPP", "2014")
df <- data.frame(ID, attributes, values)
I think you're getting confused because you actually have two tables of
data linked by a common ID:
library(dplyr)
df <- tbl_df(df)
years <- df %>%
filter(attributes == "YR") %>%
select(id = ID, year = values)
years
#> Source: local data frame [6 x 2]
#>
#> id year
#> 1 1 2014
#> 2 2 2013
#> 3 3 2014
#> 4 4 2014
#> 5 5 2013
#> .. .. ...
authors <- df %>%
filter(attributes == "AU") %>%
select(id = ID, author = values)
authors
#> Source: local data frame [16 x 2]
#>
#> id author
#> 1 1 AAA
#> 2 1 BBB
#> 3 2 CCC
#> 4 2 DDD
#> 5 2 EEE
#> .. .. ...
Once you have the data in this form, it's easy to answer the questions
you're interested in:
Authors per paper:
n_authors <- authors %>%
group_by(id) %>%
summarise(n = n())
Or
n_authors <- authors %>% count(id)
Median authors per year:
n_authors %>%
left_join(years) %>%
group_by(year) %>%
summarise(median(n))
#> Joining by: "id"
#> Source: local data frame [2 x 2]
#>
#> year median(n)
#> 1 2013 5.0
#> 2 2014 1.5
Here's a possible data.table solution. I would also suggest creating an aggregated data set with separate columns. For example:
library(data.table)
(subdf <- as.data.table(df)[, .(N.AU = sum(attributes == "AU"),
Year = values[attributes == "YR"]) , ID])
# ID N.AU Year
# 1: 1 2 2014
# 2: 2 5 2013
# 3: 3 1 2014
# 4: 4 2 2014
# 5: 5 5 2013
# 6: 6 1 2014
Calculating median per year
subdf[, .(Median.N.AU = median(N.AU)), keyby = Year]
# Year Median.N.AU
# 1: 2013 5.0
# 2: 2014 1.5
I misunderstood the structure of your dataset initially. Thanks to the comments below I realize your data needs to be restructured.
# split the data out
df1 <- df[df$attributes == "AU",]
df2 <- df[df$attributes == "YR",]
# just keeping the columns with data as opposed to the label
df3 <- merge(df1, df2, by="ID")[,c(1,3,5)]
# set column names for clarification
colnames(df3) <- c("ID","author","year")
# get author counts (count() with a vars argument is from the plyr package)
library(plyr)
num.authors <- count(df3, vars = c("ID", "year"))
ID year freq
1 1 2014 2
2 2 2013 5
3 3 2014 1
4 4 2014 2
5 5 2013 5
6 6 2014 1
library(doBy)
summaryBy(freq ~ year, data = num.authors, FUN = list(median))
year freq.median
1 2013 5.0
2 2014 1.5
The nice thing about summaryBy is that you can add whichever functions you want to the list, and you will get another column containing that metric (e.g. mean, sd, etc.).
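For example, passing both median and mean (a small sketch reusing num.authors from above):
library(doBy)
# each function in the list gets its own summary column (e.g. freq.median, freq.mean)
summaryBy(freq ~ year, data = num.authors, FUN = list(median, mean))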