It is hard to describe in words. Therefore, made a reprex
with input, output and expected output below
How can we data wrangle
1. When we function and mutate as shown below, there is ambiguity each time based on column name string
2. how can we rbind these once we have unique column names
library(tidyverse)
# Basically, "." means ",". So, better we remove . and PC and convert to Numeric
df1 <- tribble(
~`ABC sales 01.01.2019 - 01.02.2019`, ~code,
"1.019 PC", 2000, # Actually, it 1019 (remove . and PC )
"100 PC", 2101,
"3.440 PC", 2002
)
df2 <- tribble(
~`ABC sales 01.03.2019 - 01.04.2019`, ~year,
"6.019 PC", 2019,
"20 PC", 2001,
"043.440 PC", 2002
)
df3 <- tribble(
~`ABC sales 01.05.2019 - 01.06.2019`, ~year,
"1.019 PC", 2000,
"701 PC", 2101,
"6.440 PC", 2002
)
# Input data
input_df = list(df1,df2,df3)
#### function to clean data
# str_replace is used twice because
# remove PC and dot
data_read = function(file){
df_ <- df %>% #glimpse()
# Select the column to remove PC, spaces and .
# Each time, column name differs so, `ABC sales 01.01.2019 - 01.02.2019` cannot be used
mutate_at(sales_dot = str_replace(select(contains('ABC')), "PC",""),
sales = str_replace(sales_dot, "\\.",""), # name the new column so that rbind can be applied later
sales_dot = NULL, # delete the old column
vars(contains("ABC")) = NULL # delete the old column
)
df_
}
# attempt to resolve
# To clean the data from dots and PC
output_df1 <- map(input_df, data_read) # or lapply ?
# rbind
output = map(output_df1, rbind) # or lapply ?
expected_output <- df3 <- tribble(
~sales, ~year,
"1019", 2000,
"100", 2101,
"3440", 2002,
"6019", 2019,
"20", 2001,
"043440", 2002,
"1019", 2000,
"701", 2101,
"6440", 2002
)
Using purrr, dplyr and stringr, you can do:
map_df(.x = input_df, ~ .x %>%
set_names(., c("sales", "year"))) %>%
mutate(sales = str_remove_all(sales, "[. PC]"))
sales year
<chr> <dbl>
1 1019 2000
2 100 2101
3 3440 2002
4 6019 2019
5 20 2001
6 043440 2002
7 1019 2000
8 701 2101
9 6440 2002
Related
I have two data frames in these formats.
df1 <- data.frame (Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
"FAMILY BROS", "family, apple, ibm","family co.")
)
df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))
I need to create a data frame that shows each firm and its corresponding year of being a winner as illustrated in Data3 in the attached figure. I checked few links like this one Merge tables in R using like
where they use a like operator but am unable to create the desired data as there can be multiple winners in a year. Please suggest what functions should I try to create Data3. Thanks!
Figure - Desired Data Format
Using adist basically.
sp <- strsplit(df1$Winner, ',|;') |> lapply(trimws)
sp <- t(sapply(sp, `length<-`, max(lengths(sp)))) |> as.data.frame() |> cbind(Year=df1$Year)
sp <- reshape(sp, 1:3, idvar=4, direction='l', sep='') |> na.omit()
sp$Firm <- cutree(hclust(as.dist(adist(gsub('inc|co', '', tolower(sp$V))))), 4) |>
factor(labels=c('Apple', 'Sonata Inc.', 'Family Bros. Co.', 'IBM'))
subset(sp[order(sp$Firm), ], select=c(Firm, Year))
# Firm Year
# 1.1 Apple 1991
# 2.1 Apple 1992
# 3.1 Apple 1993
# 6.2 Apple 1996
# 4.1 Sonata Inc. 1994
# 5.1 Family Bros. Co. 1995
# 6.1 Family Bros. Co. 1996
# 7.1 Family Bros. Co. 1997
# 3.2 IBM 1993
# 6.3 IBM 1996
Try this
df <- sapply(gsub("\\s[a-zA-Z]+\\W" , "" ,trimws(df2$Firm)),
function(x) grepl(tolower(x) ,
tolower(df1$Winner)))
l <- lapply(data.frame(df), function(x) df1$Year[x])
l
If you want the answer in data.frame use
ans <- data.frame(Firm = gsub("[0-9]+","",names(unlist(l))) ,
year = unlist(l))
row.names(ans) <- NULL
ans
Using fuzzyjoin.
(Use the second example only if the precise ordering matters.)
library(tidyverse)
library(fuzzyjoin)
# Data
df1 <- data.frame (Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
"FAMILY BROS", "family, apple, ibm","family co.")
)
df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))
# If the order is unimportant
df1_sep <- df1 |>
separate_rows(Winner) |>
filter(!Winner %in% c("", "CO.", "inc.", "co.", "INC.", "BROS"))
df2 |>
mutate(Firm = str_squish(Firm)) |>
regex_right_join(df1_sep, by = c("Firm" = "Winner"), ignore_case = TRUE) |>
arrange(Firm, Year) |>
select(-Winner)
#> Firm Year
#> 1 APPLE 1991
#> 2 APPLE 1992
#> 3 APPLE 1993
#> 4 APPLE 1996
#> 5 Family Bros. Co. 1995
#> 6 Family Bros. Co. 1996
#> 7 Family Bros. Co. 1997
#> 8 IBM 1993
#> 9 IBM 1996
#> 10 Sonata Inc. 1994
# If desired output order matters
df1_sep <- df1 |>
separate_rows(Winner) |>
filter(!Winner %in% c("", "CO.", "inc.", "co.", "INC.", "BROS"))
df2 |>
mutate(Firm = str_squish(Firm)) |>
regex_right_join(df1_sep, by = c("Firm" = "Winner"), ignore_case = TRUE) |>
group_by(Firm) |>
mutate(sort = min(Year)) |>
ungroup() |>
arrange(sort, Year) |>
select(-Winner, -sort)
#> # A tibble: 10 × 2
#> Firm Year
#> <chr> <dbl>
#> 1 APPLE 1991
#> 2 APPLE 1992
#> 3 APPLE 1993
#> 4 APPLE 1996
#> 5 IBM 1993
#> 6 IBM 1996
#> 7 Sonata Inc. 1994
#> 8 Family Bros. Co. 1995
#> 9 Family Bros. Co. 1996
#> 10 Family Bros. Co. 1997
Created on 2022-06-18 by the reprex package (v2.0.1)
Base R, sure a simpler solution exists:
# Split each winning company up into separate elements in a list
# of character vectors: winning_companies => list of character vectors
winning_companies <- strsplit(
df1$Winner,
"\\;|\\,"
)
# Unroll the data.frame: df1_unrolled => data.frame
df1_unrolled <- data.frame(
do.call(
rbind,
lapply(
seq_len(nrow((df1))),
function(i){
transform(
df1[rep(i, length(winning_companies[[i]])),],
Winner = trimws(unlist(winning_companies[[i]]), "both")
)
}
)
),
stringsAsFactors = FALSE,
row.names = NULL
)
# Clean up the search terms: firm_names_std => character vector
df2$firm_names_std <- trimws(
gsub(
"\\w+\\.",
"",
tolower(
df2$Firm
)
),
"both"
)
# Resolve a dictionary to be used to lookup items:
# firm_dictionary => character vector
firm_dictionary <- names(
sort(
table(
df2$firm_names_std
),
decreasing = TRUE
)
)
# Function to correct the spelling: correct_spelling => function
correct_spelling <- function(firm_name_vec, firm_dictionary, similarity_threshold = NULL) {
# Derive the similarity threshold: st => integer scalar
st <- similarity_threshold
# Clean the words: firm_name => string scalar
clean_firm_names <- trimws(
gsub(
"\\w+\\.",
"",
tolower(
firm_name_vec
)
),
"both"
)
# Function to correct the spelling at a scalar level:
# .correct_spelling_scalar => function
.correct_spelling_scalar <- function(firm_name, firm_dictionary, similarity_threshold = st){
# Calculate the levenshtein distance between the cleaned word
# and each element in the dictionary: distance_from_dict => double vector
distance_from_dict <- adist(firm_name, firm_dictionary, partial = TRUE)
# If we are not using a similarity threhold:
if(is.null(similarity_threshold)){
# Resolve the intermediate result: ir => character scalar
ir <- firm_dictionary[which.min(distance_from_dict)]
# Otherwise:
}else{
# Count the number of characters of each element in the dictionary
# vector: n => integer vector
n <- nchar(firm_dictionary)
# Calculate the ratio between the number of characters differing between
# each term in the dictionary and the total of number characters
# for a given dictionary element: dist_ratio => double vector
dist_ratio <- distance_from_dict / n
# Check if distance in ratio form is within the threshold:
# selection_idx => logical vector
selection_idx <- dist_ratio <= similarity_threshold
# Resolve the intermediate result: ir => character scalar
ir <- firm_dictionary[selection_idx]
}
# Resolve company name: res => string scalar
res <- head(
c(
ir,
NA_character_
),
1
)
# Explicitly define the returned object: character scalar => env
return(res)
}
# Apply function to a vector: res => character vector
res <- vapply(
clean_firm_names,
function(x){
.correct_spelling_scalar(x, firm_dictionary)
},
character(1),
USE.NAMES = FALSE
)
# Explicitly define the returned object: character vector => env
return(res)
}
# Derive the correct spelling of the firms:
# cleaned_firm_names => character vector
cleaned_firm_names <- correct_spelling(
df1_unrolled$Winner,
firm_dictionary
)
# Use the cleaned firm names to look up the formatted names in df2:
# df3 => data.frame
df3 <- transform(
df1_unrolled,
Winner = trimws(
df2$Firm[match(cleaned_firm_names, df2$firm_names_std)],
"both"
)
)
# Output result to console: data.frame => stdout(console)
df3
Data:
df1 <- data.frame (Year = c(1991, 1992, 1993, 1994, 1995, 1996, 1997),
Winner = c("APPLE ", "apple inc.", "APPLE INC.; IBM CO.", "SONATA",
"FAMILY BROS", "family, apple, ibm","family co.")
)
df2 <- data.frame (Firm = c("APPLE ", "IBM", "Sonata Inc.","Family Bros. Co."))
I have the following dataframe:
library(dplyr)
library(tidyverse)
library(concordance)
Year <- c(2016,2016,2017,2019,2020,2020,2020,2013,2010,2010)
Pf <- c("HS4","HS4","HS4","HS5","HS5","HS5","HS5","HS4","HS3","HS3")
Code <- c("391890","440929","851660","732399","720839","050510","830241","321590","010210","010210")
Slen <- c("6","6","6","6","6","6","6","6","6","6")
df <- data.frame(Year,Pf,Code,Slen)
'Pf' column contains 3 different types of rows: "HS3", "HS4" and "HS5". I want to perform a vectorized operation and apply concord() function to the 'Code' column", however in order to do that, 'Pf' must be Unique that's why before I sebset datarames where 'Pf' column is unique
# Subset data where Pf column is unique
df.H5 <- subset(df, Pf == "HS5")
df.H4 <- subset(df, Pf == "HS4")
df.H3 <- subset(df, Pf == "HS3")
Now I apply a function to each dataframe. Here concord() function applies to 'Code' column and converts these characters to different ones. However, if destination (argument) and values in 'Pf' column are the same it does not work, for instance, if Pf="HS3" (in df) and destination = "HS3", the code does not run, that's why I don't apply code to df.H3
# Apply function to df.H5
df.H5<- df.H5 %>%
group_by(Pf, Slen) %>%
mutate(
Code2 = concord(Code, origin = unique(Pf), dest.digit = unique(Slen), destination = "HS3", all = FALSE)
) %>%
ungroup()
# Apply function to df.H4
df.H4<- df.H4 %>%
group_by(Pf, Slen) %>%
mutate(
Code2 = concord(Code, origin = unique(Pf), dest.digit = unique(Slen), destination = "HS3", all = FALSE)
) %>%
ungroup()
#add column todf.H3 in order to merge these 3 tafarames
df.H3$Code2 <- df.H3$Code
#merge
df2 <- rbind(df.H4, df.H5, df.H3)
My goal is to somehow automate the process. For instance, if destination = "HS3", the code applies whole data without pre-subsetting and if destination (argument) and rows in Pf match each other, the code does not apply to it and just copy-paste values from 'Code' to generated 'Code2' column in that case
You could put the logic in a function and use it in a by approach which splits data and applies functions. In the function you could do a case handling where supposedly P == 'HS3' should not be processed. Finally unsplit.
cf <- \(x) {
Code2 <- if (!any(x$Pf == 'HS3')) {
concordance::concord(x$Code, x$Pf[1], x$Slen[1],
destination="HS3", all=FALSE)
} else {
x$Code
}
cbind(x, Code2)
}
by(df, df$Pf, cf) |>
unsplit(df$Pf)
# Year Pf Code Slen Code2
# 1 2016 HS4 391890 6 391890
# 2 2016 HS4 440929 6 440929
# 3 2017 HS4 851660 6 851660
# 4 2019 HS5 732399 6 732399
# 5 2020 HS5 720839 6 720839
# 6 2020 HS5 050510 6 050510
# 7 2020 HS5 830241 6 830241
# 8 2013 HS4 321590 6 321590
# 9 2010 HS3 010210 6 010210
# 10 2010 HS3 010210 6 010210
Data:
df <- structure(list(Year = c(2016, 2016, 2017, 2019, 2020, 2020, 2020,
2013, 2010, 2010), Pf = c("HS4", "HS4", "HS4", "HS5", "HS5",
"HS5", "HS5", "HS4", "HS3", "HS3"), Code = c("391890", "440929",
"851660", "732399", "720839", "050510", "830241", "321590", "010210",
"010210"), Slen = c("6", "6", "6", "6", "6", "6", "6", "6", "6",
"6")), class = "data.frame", row.names = c(NA, -10L))
I wrote this code and used
library('fastDummies'):
New_Data <- dummy_cols(New_Curve_Data, select_columns = 'CountyName')
I just want the actual county name that is Banks to be displayed and not CountyName_Banks etc.
There are like 100 dummy variables that I created. So I cant manually change the names.
The prefix substring in the column names can be removed with sub by matching the 'CountyName_' as pattern and replace it with blank ("") on the names of the 'New_Data'` and assign it back
names(New_Data) <- sub("CountyName_", "", names(New_Data))
This can be also done in base R with table
as.data.frame.matrix(table(seq_len(nrow(New_Curve_Data)),
New_Curve_Data$CountyName))
You can use pivot_wider. Since you did not share an example data taking example from ?fastDummies::dummy_cols help page.
crime <- data.frame(city = c("SF", "SF", "NYC"),
year = c(1990, 2000, 1990),
crime = 1:3)
tidyr::pivot_wider(crime, names_from = city, values_from = city,
values_fn = length, values_fill = 0)
# year crime SF NYC
# <dbl> <int> <int> <int>
#1 1990 1 1 0
#2 2000 2 1 0
#3 1990 3 0 1
I'm doing trend analysis, and trying to use barcharts to visualize the frequencies of the hashtags in different years. So I can see the top 3 most frequent hashtag terms, and see how the frequencies of these terms are evolving during years I have a dataset like this:
terms year
1 #A;#B;#C 2017
2 #B;#C;#D 2016
3 #C;#D;#E 2021
4 #D;#E;#F 2020
5 #E;#F;#G 2020
6 #F;#G;#H 2020
7 #G;#H;#I 2019
8 #H;#I;#J 2018
9 #I;#J;#K 2020
10 #J;#K;#L 2020
thanks!
Basically, we need to count the hashtag for every year. Since the hashtags for a particular year is in single-column we need to separate it into different columns and then we can convert the df into a long df, where it becomes possible for us to group it based on year and hashtag to find the count.
library(tidyverse)
structure(list(terms = c("#A;#B;#C", "#B;#C;#D", "#C;#D;#E",
"#D;#E;#F", "#E;#F;#G", "#F;#G;#H", "#G;#H;#I", "#H;#I;#J", "#I;#J;#K",
"#J;#K;#L"), year = c(2017, 2016, 2021, 2020, 2020, 2020, 2019,
2018, 2020, 2020)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame")) -> df
df %>%
separate(terms, into = paste0("t", 1:3), sep = ";") %>%
pivot_longer(-year) %>%
group_by(year, value) %>%
count(value) %>%
ggplot(aes(x = year, y = n, fill = value, label = n)) +
geom_col(position = position_dodge()) +
geom_text(position = position_dodge(1))
Created on 2021-02-05 by the reprex package (v0.3.0)
To generate a nicely readable plot for each year in Base R, we can do the following:
Code
# First create a list of data.frames that we can utilize to plot
# Split by year
listdf <- split(df, df$year)
# Only choose trends and name list according to year
listdf <- lapply(listdf, "[[", 1)
# Stringsplit to obtain trends as vector for each year
listdfplot <- lapply(listdf, function(x){
unlist(strsplit(x, ";"))
})
# Plot
# Plot side by side
par(mfrow = c(2, 3))
# A barplot for each year
Map(function(x, y){
barplot(table(x), main = paste("Trends in", y), las = 2)
},
listdfplot,
names(listdfplot))
Data
df <- structure(list(terms = c("#A;#B;#C", "#B;#C;#D", "#C;#D;#E",
"#D;#E;#F", "#E;#F;#G", "#F;#G;#H", "#G;#H;#I", "#H;#I;#J", "#I;#J;#K",
"#J;#K;#L"), year = c(2017, 2016, 2021, 2020, 2020, 2020, 2019,
2018, 2020, 2020)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
You could make a new Data frame were every hashtag gets a unique information for year.
After that you can play around with the data using geom_bar.
I can not upload the plot to this post as this is a new account.
library(tidyverse)
library(data.table)
#your Data:
#terms year
#1 #A;#B;#C 2017
#2 #B;#C;#D 2016
#3 #C;#D;#E 2021
#4 #D;#E;#F 2020
#5 #E;#F;#G 2020
#6 #F;#G;#H 2020
#7 #G;#H;#I 2019
#8 #H;#I;#J 2018
#9 #I;#J;#K 2020
#10 #J;#K;#L 2020
# make a df that looks like your data:
terms<- c("#A;#B;#C",
"#B;#C;#D",
"#C;#D;#E",
"#D;#E;#F",
"#E;#F;#G",
"#G;#H;#I",
"#H;#I;#J",
"#I;#J;#K",
"#J;#K;#L")
terms<-as.data.frame(terms)
year<-c(2017,2016,2021,2020,2020,2019,2018,2020,2020)
year<-as.data.frame(year)
df<-cbind(terms,year)
# read your data from what I assume is your Data frame
terms<-c(df$terms)
year.list<-c(df$year)
loopcount<-length(terms)
# make new dummys
year<-c()
hashtags<-c()
all.years<-as.data.frame(hashtags,year)
#split hashtags based on ";"
hashtag.list<-str_split(terms, ";")
With this loop you create a new DF
# make new df were every hashtags gets the information for year
for (i in 1:loopcount){
hashtags<-hashtag.list[[i]]
hashtags<-as.data.frame(hashtags)
year<-c()
for(k in 1:nrow(hashtags)) {
year[k]<- year.list[i]
}
year<-as.data.frame(year)
one.year<-cbind(hashtags,year)
all.years<-rbind(all.years,one.year)
}
hashtagDF<-all.years
head(hashtagDF)
The new DF can then be used to to plot what you want
Or
if I understand you correct
you can make a new df were the frequency of the hashtags per year are shown
and only the top 3 hashtags are included
#only include the three most used hashtags per year
# dummys for new df
hashtags<-c()
year<-c()
Freq<-c()
top.3<-as.data.frame(hashtags,year,Freq)
years.in.study<-unique(hashtagDF$year)
#i<-3
for ( i in 1: length(years.in.study)){
what.year<-paste(years.in.study[i])
#subset per year
one.subset<-subset(hashtagDF, year == what.year)
# calculate how often a hashtag is present per year
freq<-table(one.subset)
frequency.per.year<-as.data.frame(freq)
frequency.per.year<-frequency.per.year[order(-frequency.per.year[,3]), ]
# only keep the 3 most occurring terms
lenght.of.file.to.delete<-nrow(frequency.per.year)
if (nrow(frequency.per.year) == 3){
lenght.of.file.to.delete<-lenght.of.file.to.delete+1
}
frequency.per.year<-frequency.per.year[-c(4:lenght.of.file.to.delete), ]
# make a df with all years
top.3<-rbind(top.3,frequency.per.year)
}
top.3
#order for year
top.3$year<-as.character(top.3$year)
top.3[order(top.3[,2]), ]
#year should be a factor
top.3$yearF<-as.factor(top.3$year)
Then you can plot it
# plot as barplot
# with
# the frequencies of the hashtags in different years.
# the top 3 most frequent hashtag terms per year
barplot<-ggplot(data=top.3, aes(x=yearF, y=Freq,fill=hashtags)) +
geom_bar(stat="identity")+
labs(title = "",
subtitle = "",
caption = "",
x= "",
y= "")
barplot
ggsave(file="hashtag.png", barplot, width = 210, height = 297, units = "mm")
I've got a df with country-level data entered in 2003.
Several rows of data belong to a country named 'Federal Republic of Yugoslavia'.
These are two separate countries today and I want to duplicate these rows of data so that I can rename each set of rows to its respective modern country name.
data.frame(Country = "Yugoslavia", Chickens = 567)
Using this minimal example, how do I create this dataframe?
data.frame(Country = c("Serbia", "Montenegro"), Chickens = 567)
you can do in one tidyverse pipe:
library(tidyverse)
df2 <- df %>%
mutate(Country = if_else(Country == "Yugoslavia", "Serbia", as.character(Country))) %>%
bind_rows(df) %>%
mutate(Country = if_else(Country == "Yugoslavia", "Montenegro", as.character(Country)))
You could also use mutate_if instead of the if_else statements.
Country Chickens
1 Serbia 567
2 Montenegro 567
By default data.frame turns character columns into factors. The substitution above coerces into character.
If you want to preserve the factor class then just add:
%>% mutate(Country = as.factor(Country))
... at the end.
You can do something like this:
data2<-data[data$country=="Yugoslavia"]
levels(data2$country)[levels(data2$country)=="Yugoslavia"]<-"Serbia"
levels(data$country)[levels(data$country)=="Yugoslavia"]<-"Montenegro"
rbind(data,data2)
You can write a function which returns the duplicated and renamed rows like:
fun <- function(y) {
if(y[["Country"]] == "Yugoslavia") rbind(replace(y, "Country", "Serbia")
, replace(y, "Country", "Montenegro"))
else y
}
do.call("rbind", apply(x, 1, fun))
# Country Chickens
#[1,] "Italy" " 2"
#[2,] "Serbia" "567"
#[3,] "Montenegro" "567"
#[4,] "Austria" " 3"
Or if order does not matter:
rbind(x[x$Country != "Yugoslavia",]
, replace(x[x$Country == "Yugoslavia",], "Country", "Serbia")
, replace(x[x$Country == "Yugoslavia",], "Country", "Montenegro"))
# Country Chickens
#1 Italy 2
#3 Austria 3
#2 Serbia 567
#21 Montenegro 567
Data:
x <- data.frame(Country = c("Italy","Yugoslavia","Austria"), Chickens = c(2,567,3))
x
# Country Chickens
#1 Italy 2
#2 Yugoslavia 567
#3 Austria 3