Plotting Y (country) against X (years) showing GDP growth of 24 countries YoY (line graph) - r

I am not sure what I am doing wrong, but I keep encountering errors. The aim of my project is to narrow down a selection of countries based on the variables that make sense to me, so I tried to cut the list down bit by bit with the code below (pardon the mess, as I have just started learning R). I have managed to get the 24 desired countries and will definitely want to narrow the list further, but first I would like to plot the chosen countries against the years 2015:2019 and show their GDP growth.
So I keep getting the error such as Error in dimnames(x) <- dnx : 'dimnames' applied to non-array and Error in dataframe.
I am not sure what I am supposed to do after library(tidyr). I tried many options I found online, but I suspect I am doing something wrong before that point in the code.
data1 <-WDI(indicator= c("IT.NET.USER.ZS", "BX.KLT.DINV.CD.WD", "IT.NET.SECR.P6" , "NY.GDP.MKTP.KD.ZG"), start = 2015, end = 2019, extra = FALSE)
#get column names
colnames(data1)
#View data
data1 %>%
view()
#Change column names
names(data1)[names(data1) == "IT.NET.USER.ZS"] <- "internet_users"
names(data1)[names(data1) == "BX.KLT.DINV.CD.WD"] <- "foreign_direct_investment"
names(data1)[names(data1) == "NY.GDP.MKTP.KD.ZG"] <- "gdp_growth"
names(data1)[names(data1) == "IT.NET.SECR.P6"] <- "secure_internet"
summary(data1)
data1 %>%
count(internet_users, sort = TRUE)
data1 %>%
count(year, sort = TRUE)
data1 %>%
count(country, sort = TRUE)
view(data1)
gdp <- summary(data1$gdp_growth)
users <- summary(data1$internet_users)
fdi <- summary(data1$foreign_direct_investment)
secure <- summary(data1$secure_internet)
country <- summary(data1$country)
names(data1)
#Selecting range from Mean to 3rd Quartile of data1 for internet_users
data2 <- data1[ c(data1$internet_users < 76.56 & data1$internet_users > 52.54 & data1$year == 2019), ]
summary(data2)
#Selecting gdp growth >= Mean of gdp growth 2.470
data3 <- data2[ c(data2$gdp_growth >= 2.4451),]
is.na(data3)
na.omit(data3)
view(data3)
#Removal of non country data from data3
data4 <- data3[-c(1,4,9,17,19,20,24,25,29,30,33,34,35,36,38,39,40,42,43,44),]
view(data4)
countries_1 <- c(data4$country)
head(countries_1)
summary(countries_1)
#trynna plot something that works with year/country and gdp
yearchoice <- c(year = 2015:2019)
str(data4)
datalinegraph <- data.frame(c(yearchoice,countries_1))
unique(data4$country)
**#listing the countries I think I need to plat against**
data5 <- data1 %>%
filter(country %in% countries_1)
library(tidyr)
data6 <-data.frame(data5)
data6.df$gdp = rownames(data5)
df.long = gather(data = data5,
key = yearchoice,
value = gdp)
ggplot(data = df.long, aes(x = yearchoice,
y = gdp,
group=data5,
color=variable)) +
geom_line() +
geom_point()```

Here's one that's country variables within country:
data1 <-WDI(indicator= c("IT.NET.USER.ZS", "BX.KLT.DINV.CD.WD", "IT.NET.SECR.P6" , "NY.GDP.MKTP.KD.ZG"), start = 2015, end = 2019, extra = FALSE)
#Change column names
names(data1)[names(data1) == "IT.NET.USER.ZS"] <- "internet_users"
names(data1)[names(data1) == "BX.KLT.DINV.CD.WD"] <- "foreign_direct_investment"
names(data1)[names(data1) == "NY.GDP.MKTP.KD.ZG"] <- "gdp_growth"
names(data1)[names(data1) == "IT.NET.SECR.P6"] <- "secure_internet"
#Selecting range from Mean to 3rd Quartile of data1 for internet_users
data2 <- data1 %>% filter(internet_users < 76.56 &
internet_users > 52.54 &
year == 2019 &
gdp_growth >= 2.4451)
#Removal of non country data from data3
data4 <- data2 %>% filter(!iso2c %in% c("4E", "T4", "V3", "XT", "Z4"))
countries_1 <- c(data4$country)
#listing the countries I think I need to plat against**
data5 <- data1 %>%
filter(country %in% countries_1)
# Min-max rescaling: subtract the smallest non-missing value of x, then divide
# by the largest shifted value. Missing entries are ignored when locating the
# extremes and stay NA in the result.
r01 <- function(x) {
  shifted <- x - min(x, na.rm = TRUE)
  shifted / max(shifted, na.rm = TRUE)
}
df.long = data5 %>%
mutate(across(internet_users:gdp_growth, r01)) %>%
pivot_longer(internet_users:gdp_growth,
names_to="variables",
values_to="vals")
ggplot(data = df.long, aes(x = year,
y = vals,
color=variables)) +
geom_line() +
facet_wrap(~country) +
theme(legend.position = "top")
And here's one that's countries within variables:
df.long2 = data5 %>%
pivot_longer(internet_users:gdp_growth,
names_to="variables",
values_to="vals")
ggplot(data = df.long2, aes(x = year,
y = vals,
color=country)) +
geom_line() +
facet_wrap(~variables, scales="free_y") +
theme(legend.position = "top")

Related

For Loop Efficiency

The following loop is effective in that it gets me to the finish line, but I'm looking for a way to make it more efficient, as I'm looping through a large dataset. Possibly using a purrr function?
library(tidyverse)
library(timetk)
#### CREATE DATA
df_1 <- data.frame(Date = seq.Date(as.Date("2016-01-01"), length.out = 36, by = "month"),
Inventory = round(runif(36,5,100),0),
Purchases = round(runif(36,5,100),0),
Sales = round(runif(36,5,100),0),
Ending_Inventory = round(runif(36,5,100),0)) %>%
mutate(Starting_Inventory = lag(Ending_Inventory,1)) %>%
mutate(product = "Product_1")
df_2 <- data.frame(Date = seq.Date(as.Date("2016-01-01"), length.out = 36, by = "month"),
Inventory = round(runif(36,5,100),0),
Purchases = round(runif(36,5,100),0),
Sales = round(runif(36,5,100),0),
Ending_Inventory = round(runif(36,5,100),0)) %>%
mutate(Starting_Inventory = lag(Ending_Inventory,1)) %>%
mutate(product = "Product_2")
df <- rbind(df_1, df_2) %>%
group_by(product) %>%
timetk::future_frame(
.date_var = Date,
.length_out = "12 months",
.bind_data = TRUE
)
Here I'm creating a date sequence to iterate through the for loop
#### CREATE DATE SEQUENCE
Dates <- seq(min(df$Date) %m+% months(36), min(df$Date) %m+% months(48), by = "month")
The dates from the sequence above will iterate through the loop to fill in the future data and then I join, rename some columns, and drop all that contain ("y")... Seems like I'm performing some steps that aren't necessary.
for (i in 1:length(Dates)){
df <- df %>%
mutate(Purchases = case_when(Date < Dates[i] ~ Purchases,
Date == Dates[i] ~ lag(Purchases, 12)*1.05,
TRUE ~ 0
)) %>%
mutate(Starting_Inventory = case_when(Date < Dates[i] ~ Starting_Inventory,
Date == Dates[i] ~ lag(Ending_Inventory,1),
TRUE ~ 0
)) %>%
mutate(Sales = case_when(Date < Dates[i] ~ Sales,
Date == Dates[i] ~ lag(Sales,12) * 1.15,
TRUE ~ 0
)) %>%
mutate(Ending_Inventory = case_when(Date < Dates[i] ~ Ending_Inventory,
Date == Dates[i] ~ Starting_Inventory + Sales + Purchases,
TRUE ~ 0
)) %>%
mutate(Inventory = case_when(Date < Dates[i] ~ Inventory,
Date == Dates[i] ~ Ending_Inventory,
TRUE ~ 0
))
new_data <- df[df$Date == (Dates[i]),]
df <- df %>%
left_join(., new_data, by = c("product", "Date")) %>%
mutate(Inventory.x = ifelse(Date == Dates[i],Inventory.y,Inventory.x),
Purchases.x = ifelse(Date == Dates[i],Purchases.y,Purchases.x),
Sales.x = ifelse(Date == Dates[i],Sales.y,Sales.x),
Starting_Inventory.x = ifelse(Date == Dates[i],Starting_Inventory.y,Starting_Inventory.x),
Ending_Inventory.x = ifelse(Date == Dates[i],Ending_Inventory.y,Ending_Inventory.x),
) %>%
rename(Inventory = Inventory.x,
Purchases = Purchases.x,
Starting_Inventory = Starting_Inventory.x,
Sales = Sales.x,
Ending_Inventory = Ending_Inventory.x) %>%
dplyr::select(-contains(".y"))
return
print(i)
gc()
}
There are a lot of unnecessary steps in there.
Mutate can take more than one expression at once.
The case_when is unnecessary since in the next step you only keep the rows that got modified.
Then, for the same reason, the join and renaming is more steps than needed, you can just replace the old rows with the new row by selecting a subset.
for (i in seq_along(Dates)){
new_data <- df2 %>%
mutate(Purchases = lag(Purchases, 12)*1.05,
Starting_Inventory = lag(Ending_Inventory,1),
Sales = lag(Sales,12) * 1.15,
Ending_Inventory = Starting_Inventory + Sales + Purchases,
Inventory = Ending_Inventory)
df2[df2$Date == Dates[i],] <- new_data[new_data$Date == Dates[i],]
}
But then you're still recalculating your whole data.frame on each pass of the loop. There is no need for that either, since mutate() is iterative. You can do it all with just that function.
Also, since there are only 2 conditions really needed, you can replace the case_when with ifelse and it's faster.
# Fill every row whose Date appears in Dates within a single mutate() call;
# rows whose Date is not in Dates keep their current values.
# NOTE(review): the columns are computed in the order written, each as one
# vectorised pass over the whole column. lag(Ending_Inventory, 1) is therefore
# evaluated before Ending_Inventory is refreshed further down -- confirm that
# consecutive rows in Dates receive the intended Starting_Inventory.
df <- df %>%
mutate(
# Purchases: value from 12 rows earlier, scaled by 1.05.
Purchases = ifelse(
Date %in% Dates, lag(Purchases, 12)*1.05, Purchases
),
# Starting_Inventory: previous row's Ending_Inventory.
Starting_Inventory = ifelse(
Date %in% Dates, lag(Ending_Inventory,1), Starting_Inventory
),
# Sales: value from 12 rows earlier, scaled by 1.15.
Sales = ifelse(
Date %in% Dates, lag(Sales,12) * 1.15, Sales
),
# Ending_Inventory: uses the Starting_Inventory, Sales and Purchases
# columns as already updated above in this same mutate().
Ending_Inventory = ifelse(
Date %in% Dates, Starting_Inventory + Sales + Purchases,
Ending_Inventory
),
# Inventory: mirrors the freshly computed Ending_Inventory.
Inventory = ifelse(
Date %in% Dates, Ending_Inventory, Inventory
)
)
Edit:
I think it's important to break down what you're trying to do when you end up with a long for loop like this. Since you're trying to do in-place modifications, you could do this even in base R, with a for loop as short as this:
df3 <- df.o
df3 <- df3 |> within({
for (i in which(Date %in% Dates)){
Purchases[i] = Purchases[i-12]*1.05
Sales[i] = Sales[i-12] * 1.15
Ending_Inventory[i] = Starting_Inventory[i] + Sales[i] + Purchases[i]
Inventory[i] = Ending_Inventory[i]
Starting_Inventory[i] = Ending_Inventory[i-1]
}
i = NULL
})
A bit slower than mutate, but it's the same logic.

Why doesn't mutate function generate variable?

the code is the following:
tb <- tibble(
year = rep(2001:2020,10)
)
tb %>% arrange(year) %>%
mutate(
id = rep(1:10,20),
r1 = rnorm(200,0,1),
r2 = rnorm(200,1,1),
r3 = rnorm(200,2,1)
)
for (i in 1:5) {
tb %>% mutate(
T_i = r1*(year = 2004 + i) +
r2*(year = 2007 + i) +
r3*(year = 2009 + i)
)
}
colnames(tb)
I hope to see 6 variables after colnames(tb): "year" "id" "r1" "r2" "r3" "T_i" but it seems the for loop doesn't work. So I can only get the first five variables while T_i does not show up. I don't know why.
When using the pipe (%>%) operator, you haven't yet assigned tb to its new value. You are currently simply printing what tb would look like after applying the pipe.
The correct edit would be:
# Build the panel: years 2001-2020 repeated 10 times, sorted by year, plus an
# id column and three random columns. The result is assigned back to `tb`;
# without the assignment the pipeline only prints the modified tibble and
# `tb` keeps its single `year` column.
tb <- tibble(year = rep(2001:2020, 10))
tb <- tb %>%
  arrange(year) %>%
  mutate(
    id = rep(1:10, 20),
    r1 = rnorm(200, 0, 1),
    r2 = rnorm(200, 1, 1),
    r3 = rnorm(200, 2, 1)
  )

# `==` compares `year` with the target value and yields a TRUE/FALSE (1/0)
# indicator per row. A single `=` inside the parentheses is an assignment and
# evaluates to the scalar on its right-hand side, not to a comparison.
# The new column is literally named `T_i`, so every pass overwrites the
# previous one and only the values from the last iteration remain.
for (i in 1:5) {
  tb <- tb %>%
    mutate(
      T_i = r1 * (year == 2004 + i) +
        r2 * (year == 2007 + i) +
        r3 * (year == 2009 + i)
    )
}
colnames(tb)
In general, the following simply prints:
dat %>% mutate(...)
The following assigns a new value to the variable:
dat <- dat %>% mutate(...)
And you need to do both together if you wish to reassign the value and print:
dat %>% mutate(...)
dat
If you want to do both in one line, also try:
(dat %>% mutate(...))

group “weighted” rolling mean while excluding own group value when a group has multiple observations

I'm trying to calculate group “weighted” rolling mean while excluding own group value when a group has multiple observations. This is related to my earlier question group "weighted" mean with multiple grouping variables and excluding own group value. The key difference is that this method is not readily applicable since now a group has multiple observations.
Based on the following dataset, here's the operation I want to apply. For instance, the new variable for the first two rows will take 19*9/18 + 48*3/18 + 6*2/18 + 31*4/18 = 25.05. The next two rows will take 81*1/10 + 52*3/10 + 6*2/10 + 31*4/10 = 37.3, and so on.
set.seed(57)
df <- data.frame(
state = rep(c("AL", "CA"), each = 12),
year = rep(c(2011:2012), 12),
county = rep(letters[1:6], each = 4),
value = sample(100, 24),
wt = sample(10, 24, replace = T)
) %>% arrange(state, year)
If I apply the following code, the issue is that observation from the same county is also included in the weighted mean formula.
df %>%
group_by(state, year) %>%
mutate(new_val = purrr::map_dbl(row_number(),
~weighted.mean(value[-.x], wt[-.x])))
As a workaround, I've tried the following (find the weighted mean within each county-year first, then apply the code above), but the two approaches do not produce the same results, though they are somewhat similar.
df %>%
group_by(state, county, year) %>%
mutate(wp = weighted.mean(value, wt),
wt2 = sum(wt)) %>%
distinct(state, year, county, wp, wt2) %>%
ungroup() %>%
group_by(state, year) %>%
mutate(new_val = purrr::map_dbl(row_number(),
~weighted.mean(wp[-.x], wt2[-.x])))
Thank you for taking the time to read this!
I found an answer, but I'm sure that this is not the best approach. Any other suggestions would be very helpful for future reference.
x <- c(rep(c(letters[1:3]), 2), rep(c(letters[4:6]), 2))
year <- rep(rep(c(2011:2012), each = 3), time = 2)
state <- rep(c("AL", "CA"), each = 6)
# Weighted mean of `value` (weights `wt`) over the rows of the global data
# frame `df` that match the given `year` and `state` but belong to a county
# other than `x`.
get_wv <- function(x, year, state) {
  others <- df$county != x & df$year == year & df$state == state
  weighted.mean(df$value[others], df$wt[others])
}
res <- pmap(.l = list(x, year, state), .f = get_wv)

How to create a table with 1st row and 1st column as the header?

I want to create a data table in R from some data that I have already obtained. However, I'm not sure how to put the data into table form, because it takes some skill to place the return data, monthlyRet, into the table under the correct month. The diagram attached below shows the table format that I want; the data inside also needs to be included.
Please note that the data for No.of.Positive and No.of.Negative are started from Aug instead of Jan due to the starting date in getSymbols. Hence, I wish the No.of.Positive and No.of.Negative can be arranged in the table created from Jan to Dec as shown in the diagram below.
The code below is how I obtained my data.
library(quantmod)
prices <-
getSymbols("^NDX", src = 'yahoo', from = "2009-07-01", to = "2019-08-01",
periodicity = "monthly", auto.assign = FALSE, warnings = FALSE)[,4]
return <- diff(log(prices))
r <- na.omit(exp(return)-1)
monthlyRet <- as.numeric(r)
# Summarise the monthly returns by calendar position: slot j collects
# observations j, j + 12, j + 24, ... of monthlyRet, i.e. the same calendar
# month across years (slot 1 is the month of the first return).
# Result vectors are preallocated instead of being grown inside the loop.
meanMonthlyRet <- numeric(12)
No.of.Positive <- numeric(12)
No.of.Negative <- numeric(12)
for (j in seq_len(12)) {
  # Returns belonging to slot j, taken directly rather than via a sparse
  # vector padded with NA.
  Group <- monthlyRet[seq(j, length(monthlyRet), by = 12)]
  count_pos <- sum(Group > 0)
  count_neg <- sum(Group < 0)
  # Number of observations actually present in this slot. The complementary
  # count below is derived from it rather than from a fixed 10, so the result
  # stays correct when the start/end dates of the download change.
  n_obs <- length(Group)
  meanMonthlyRet[j] <- mean(Group, na.rm = TRUE)

  # The sign of the slot's mean decides which side is counted directly; the
  # other side is the remainder of the slot. A mean of exactly zero leaves
  # both counts at 0.
  Positive <- 0
  Negative <- 0
  if (meanMonthlyRet[j] > 0) {
    Positive <- count_pos
    Negative <- n_obs - Positive
  } else if (meanMonthlyRet[j] < 0) {
    Negative <- count_neg
    Positive <- n_obs - Negative
  }
  No.of.Positive[j] <- Positive
  No.of.Negative[j] <- Negative
}
# My data required in table #--------------------------------------------------
Year <- c(2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019)
Month <- c("Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr","May","Jun","Jul")
r
No.of.Positive
No.of.Negative
I hope I can obtain exactly the same table format and content as the diagram below (I manually created in excel). Further, if the start and end date in getSymbols are changed, I hope the data in the table will still be correct.
Here is a tidyverse solution for your problem.
library(quantmod)
library(tidyverse)
# Download the monthly series; [, 4] keeps the fourth column of the result
# (presumably the closing price -- confirm against the getSymbols output).
prices <- getSymbols("^NDX", src = 'yahoo', from = "2009-07-01",
                     to = "2019-08-01", periodicity = "monthly",
                     auto.assign = FALSE, warnings = FALSE)[, 4]

# Simple returns: exp(diff(log(prices))) - 1.
r <- prices %>%
  log %>%
  diff %>%
  exp %>%
  {. - 1}

# One row per year, one column per month, values in percent rounded to two
# decimals. The row names are split into year / month / day and the day part
# is dropped before spreading the months into columns.
table_out <- r %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  set_names(c("date", "variable")) %>%
  mutate(variable = (variable * 100) %>% round(2)) %>%
  separate(date, c("year", "month", "day")) %>%
  select(-day) %>%
  spread(month, variable)

# Per-column counts of positive and negative entries, ignoring missing cells.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned. The counts are taken over every column, including `year`; the
# `year` cells of the two summary rows are overwritten with labels below.
n_pos <- map_dbl(table_out, ~sum(. > 0, na.rm = TRUE))
n_neg <- map_dbl(table_out, ~sum(. < 0, na.rm = TRUE))

# Append "%" to the numeric cells, then add the two summary rows at the bottom.
table_out <- table_out %>%
  mutate_if(is.double, ~str_c(., "%")) %>%
  rbind(n_pos, n_neg)

# Label the two appended rows.
x <- nrow(table_out)
table_out[(x - 1):x, "year"] <- c("No. of Positive", "No. of Negative")
table_out

R: Return the first "n" rows and group the remaining rows into "Other" row and summarise the column

I'm Brazilian, sorry about my english!
I would like to know if there is a function implemented in some R package to keep the first "n" rows, group the remaining rows into an "Other" row, and summarise the column.
Here is below an example of what I want:
library(tidyverse)
library(plotly)
library(scales)
data("lakers")
x = bind_rows(
lakers %>% count(player) %>% arrange(-n) %>% head(10),
lakers %>% count(player) %>% arrange(-n) %>% slice(11:n()) %>%
summarise(player = "Others", n = sum(n))) %>%
filter(!player == "") %>%
mutate(
player = factor(player, levels = rev(.$player)))
ggplot(x, aes(x=player, y=n))+
geom_col(fill = "DodgerBlue1", col = "DodgerBlue3")+
coord_flip()+
geom_text(aes(y=n, label = comma(n)),hjust = -.2)+
scale_y_continuous(limits = c(0, max( x$n*1.1 )))+
theme_minimal()
I need to create a ggplot like that. I have a big query using dplyr and I don't want to repeat the query every time.
I would like some function like:
head.other(x, rows = 20, fun = sum, name = "Others")
Here is a function that I think will give you what you need:
library(tibble)
library(dplyr)
df <- data.frame(col1 = rnorm(10), col2 = rnorm(10)) # your data frame
n <- 6 # top n rows to keep
# Keep the first `n` rows of `df` and collapse all remaining rows into a
# single "Other" row holding the column sums.
#
# Arguments:
#   df  data frame whose columns can all be passed to sum().
#   n   number of leading rows to keep (single non-negative number).
#
# Returns a data frame with a leading `rowname` column ("r1", "r2", ...,
# then "Other") followed by the columns of `df`. When `n` is at least
# nrow(df) there is nothing left to aggregate, so no "Other" row is added.
# sum() raises an error if a column in the aggregated part is not summable.
myfun <- function(df, n) {
  stopifnot(is.data.frame(df), is.numeric(n), length(n) == 1L, n >= 0)

  # Clamp n so the index ranges below never run past the data; seq_len()
  # also yields an empty range for 0, unlike 1:n and (n + 1):nrow(df).
  n_keep <- as.integer(min(n, nrow(df)))
  kept <- df[seq_len(n_keep), , drop = FALSE]
  rest <- df[seq_len(nrow(df) - n_keep) + n_keep, , drop = FALSE]
  labels <- paste0("r", seq_len(n_keep))

  if (nrow(rest) > 0) {
    # One-row data frame of column sums, named exactly like the input so it
    # can be stacked under the kept rows.
    totals <- as.data.frame(lapply(rest, sum))
    names(totals) <- names(df)
    kept <- rbind(kept, totals)
    labels <- c(labels, "Other")
  }

  # Drop inherited row names; the identifying labels live in `rowname`.
  rownames(kept) <- NULL
  data.frame(rowname = labels, kept,
             check.names = FALSE, stringsAsFactors = FALSE)
}
myfun(df, 6)

Resources