I have the following dataframe:
library(dplyr)
library(tidyverse)
library(concordance)
# Example data: one row per product code, tagged with its classification (Pf).
Year <- c(2016, 2016, 2017, 2019, 2020, 2020, 2020, 2013, 2010, 2010)
# Classification of each code, in row order: 3x HS4, 4x HS5, 1x HS4, 2x HS3.
Pf <- rep(c("HS4", "HS5", "HS4", "HS3"), times = c(3, 4, 1, 2))
Code <- c(
  "391890", "440929", "851660", "732399", "720839",
  "050510", "830241", "321590", "010210", "010210"
)
# Digit length of every code, kept as character.
Slen <- rep("6", length(Code))
df <- data.frame(Year = Year, Pf = Pf, Code = Code, Slen = Slen)
The 'Pf' column contains three different values: "HS3", "HS4" and "HS5". I want to perform a vectorized operation and apply the concord() function to the 'Code' column; however, to do that, 'Pf' must be unique within the data passed to it. That is why I first subset the data frame into data frames in which the 'Pf' column has a single value:
# Subset data where Pf column is unique
# One data frame per classification; which() discards NA comparisons, which
# is also what subset() does.
df.H5 <- df[which(df$Pf == "HS5"), ]
df.H4 <- df[which(df$Pf == "HS4"), ]
df.H3 <- df[which(df$Pf == "HS3"), ]
Now I apply a function to each dataframe. Here concord() function applies to 'Code' column and converts these characters to different ones. However, if destination (argument) and values in 'Pf' column are the same it does not work, for instance, if Pf="HS3" (in df) and destination = "HS3", the code does not run, that's why I don't apply code to df.H3
# Add a Code2 column with the codes of `data` converted to `destination`.
#
# data         Data frame with columns Pf, Slen and Code. Grouping by Pf and
#              Slen means concord() is called once per combination, each
#              time with a single origin and a single digit length.
# destination  Target classification handed to concord(); defaults to "HS3".
#
# Returns `data` (ungrouped) with the extra column Code2.
add_code2 <- function(data, destination = "HS3") {
  data %>%
    group_by(Pf, Slen) %>%
    mutate(
      Code2 = concord(
        Code,
        origin = unique(Pf),
        dest.digit = unique(Slen),
        destination = destination,
        all = FALSE
      )
    ) %>%
    ungroup()
}

# Apply function to df.H5
df.H5 <- add_code2(df.H5)
# Apply function to df.H4
df.H4 <- add_code2(df.H4)
# Add the column to df.H3 so the three data frames can be merged; its codes
# are already in the destination classification, so they are copied as-is
df.H3$Code2 <- df.H3$Code
# Merge
df2 <- rbind(df.H4, df.H5, df.H3)
My goal is to automate this process. For instance, with destination = "HS3" the code should run on the whole data frame without pre-subsetting; for rows whose 'Pf' value already equals the destination, concord() should not be applied and the value of 'Code' should simply be copied into the new 'Code2' column.
You could put the logic in a function and use it in a `by` approach, which splits the data and applies the function to each piece. Inside the function you handle the case where Pf == 'HS3', which should not be processed. Finally, `unsplit` the result.
# Add a Code2 column holding the codes of `x` converted to `destination`.
#
# x            Data frame with columns Pf, Code and Slen; intended to be one
#              piece of a split by Pf, so only the first Pf/Slen value is
#              passed on to concord().
# destination  Target classification handed to concord(); defaults to "HS3",
#              the value that was previously hard-coded.
#
# Returns `x` with the extra column Code2. When `x` is already in the
# destination classification the codes are copied unchanged instead of being
# sent through concord().
cf <- \(x, destination = "HS3") {
  Code2 <- if (!any(x$Pf == destination)) {
    concordance::concord(x$Code, x$Pf[1], x$Slen[1],
                         destination = destination, all = FALSE)
  } else {
    x$Code
  }
  cbind(x, Code2)
}
by(df, df$Pf, cf) |>
unsplit(df$Pf)
# Year Pf Code Slen Code2
# 1 2016 HS4 391890 6 391890
# 2 2016 HS4 440929 6 440929
# 3 2017 HS4 851660 6 851660
# 4 2019 HS5 732399 6 732399
# 5 2020 HS5 720839 6 720839
# 6 2020 HS5 050510 6 050510
# 7 2020 HS5 830241 6 830241
# 8 2013 HS4 321590 6 321590
# 9 2010 HS3 010210 6 010210
# 10 2010 HS3 010210 6 010210
Data:
# Reproducible copy of the example data; Pf, Code and Slen are character.
df <- data.frame(
  Year = c(2016, 2016, 2017, 2019, 2020, 2020, 2020, 2013, 2010, 2010),
  Pf = c("HS4", "HS4", "HS4", "HS5", "HS5", "HS5", "HS5", "HS4", "HS3", "HS3"),
  Code = c(
    "391890", "440929", "851660", "732399", "720839",
    "050510", "830241", "321590", "010210", "010210"
  ),
  Slen = rep("6", 10),
  stringsAsFactors = FALSE
)
Related
I have a data frame of postcodes with a regional/metro classification assigned. In some instances, due to the datasource, the same postcode will occur with both a regional and metro classification.
POSTCODE REGON
1 3000 METRO
2 3000 REGIONAL
3 3256 METRO
4 3145 METRO
I am wondering how to remove the duplicate row and replace the region with "SPLIT" in these instances.
I have tried using the code below; however, this reassigns the entire dataset to either "METRO" or "REGIONAL":
# NOTE: duplicated("Postcode") is applied to the literal string "Postcode"
# (a length-1 vector, so the result is a single FALSE), not to the postcode
# column. ifelse() therefore yields one value, which is recycled over every row.
test <- within(PC_ACTM, REGION <- ifelse(duplicated("Postcode"), "SPLIT", REGION))
The desired output would be
POSTCODE REGON
1 3000 SPLIT
2 3256 METRO
3 3145 METRO
Example data:
dput(PC_ACTM)
structure(list(POSTCODE = c(3000L, 3000L, 3256L, 3145L), REGON = c("METRO",
"REGIONAL", "METRO", "METRO")), class = "data.frame", row.names = c("1",
"2", "3", "4"))
Based on your title, you're looking for an ifelse() solution; perhaps this will suit?
# Rebuild the example data, with the region column spelled REGION.
PC_ACTM <- data.frame(
  POSTCODE = c(3000L, 3000L, 3256L, 3145L),
  REGION = c("METRO", "REGIONAL", "METRO", "METRO"),
  row.names = c("1", "2", "3", "4"),
  stringsAsFactors = FALSE
)
# Mark every repeated occurrence of a postcode as "SPLIT" ...
repeated <- duplicated(PC_ACTM$POSTCODE)
PC_ACTM$REGION[repeated] <- "SPLIT"
# ... then keep only the last row of each postcode, which carries that mark.
is_last <- !duplicated(PC_ACTM$POSTCODE, fromLast = TRUE)
PC_ACTM[is_last, ]
#> POSTCODE REGION
#> 2 3000 SPLIT
#> 3 3256 METRO
#> 4 3145 METRO
Created on 2022-04-07 by the reprex package (v2.0.1)
Consider using ave to compute a sequential count and a total count by group. Use ifelse to replace the value on the last row of any group with a count over 1, then subset to the last row of each group. The code below uses the native pipe |> introduced in base R 4.1.0:
# Number the rows within each postcode, mark the last row of any postcode that
# occurs more than once as "SPLIT", and keep only that last row per postcode.
# The helper columns are built from POSTCODE itself: `test` does not exist yet
# while this expression is evaluated, so it cannot be referenced here.
test <- within(
  PC_ACTM, {
    # Position of the row inside its postcode group (1, 2, ...)
    PC_SEQ <- ave(seq_along(POSTCODE), POSTCODE, FUN = seq_along)
    # Number of rows sharing the postcode
    PC_COUNT <- ave(seq_along(POSTCODE), POSTCODE, FUN = length)
    REGION <- ifelse(
      (PC_SEQ == PC_COUNT) & (PC_COUNT > 1), "SPLIT", REGION
    )
  }
) |> subset(
  subset = PC_SEQ == PC_COUNT,    # SUBSET ROWS
  select = c(POSTCODE, REGION)    # SELECT COLUMNS
) |> `row.names<-`(NULL)          # RESET ROW NAMES
I'm doing trend analysis and trying to use bar charts to visualize the frequencies of hashtags in different years, so that I can see the top 3 most frequent hashtag terms and how their frequencies evolve over the years. I have a dataset like this:
terms year
1 #A;#B;#C 2017
2 #B;#C;#D 2016
3 #C;#D;#E 2021
4 #D;#E;#F 2020
5 #E;#F;#G 2020
6 #F;#G;#H 2020
7 #G;#H;#I 2019
8 #H;#I;#J 2018
9 #I;#J;#K 2020
10 #J;#K;#L 2020
thanks!
Basically, we need to count the hashtags for every year. Since all hashtags of a post are stored in a single column, we first need to separate them and convert the data frame into a long format, where it becomes possible to group by year and hashtag to find the counts.
library(tidyverse)

df <- structure(list(terms = c("#A;#B;#C", "#B;#C;#D", "#C;#D;#E",
"#D;#E;#F", "#E;#F;#G", "#F;#G;#H", "#G;#H;#I", "#H;#I;#J", "#I;#J;#K",
"#J;#K;#L"), year = c(2017, 2016, 2021, 2020, 2020, 2020, 2019,
2018, 2020, 2020)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))

# One dodge width shared by bars and labels, so each count is centred on
# its own bar.
dodge <- position_dodge(width = 0.9)

df %>%
  # One row per hashtag, however many hashtags a row of `terms` contains
  separate_rows(terms, sep = ";") %>%
  # Occurrences of every hashtag within every year
  count(year, hashtag = terms) %>%
  ggplot(aes(x = year, y = n, fill = hashtag, label = n)) +
  geom_col(position = dodge) +
  geom_text(position = dodge)
Created on 2021-02-05 by the reprex package (v0.3.0)
To generate a nicely readable plot for each year in Base R, we can do the following:
Code
# First create a list of data.frames that we can utilize to plot
# Split by year
listdf <- split(df, df$year)
# Keep only the hashtag strings of each year (selected by name rather than by
# column position); the list names are the years
listdf <- lapply(listdf, function(x) x[["terms"]])
# Split each string on the literal ";" to get one vector of hashtags per year
listdfplot <- lapply(listdf, function(x) {
  unlist(strsplit(x, ";", fixed = TRUE))
})
# Plot
# Plot side by side: three panels per row and as many rows as the number of
# years requires. par() returns the previous settings so they can be restored.
n_col <- 3
n_row <- max(1, ceiling(length(listdfplot) / n_col))
old_par <- par(mfrow = c(n_row, n_col))
# A barplot for each year; invisible() keeps the list that Map() returns
# from being printed to the console
invisible(Map(function(x, y) {
  barplot(table(x), main = paste("Trends in", y), las = 2)
},
listdfplot,
names(listdfplot)))
# Restore the plot layout that was active before
par(old_par)
Data
df <- structure(list(terms = c("#A;#B;#C", "#B;#C;#D", "#C;#D;#E",
"#D;#E;#F", "#E;#F;#G", "#F;#G;#H", "#G;#H;#I", "#H;#I;#J", "#I;#J;#K",
"#J;#K;#L"), year = c(2017, 2016, 2021, 2020, 2020, 2020, 2019,
2018, 2020, 2020)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
You could make a new data frame in which every hashtag gets its own row together with the year it belongs to.
After that you can play around with the data using geom_bar.
I can not upload the plot to this post as this is a new account.
library(tidyverse)
library(data.table)
#your Data:
#terms year
#1 #A;#B;#C 2017
#2 #B;#C;#D 2016
#3 #C;#D;#E 2021
#4 #D;#E;#F 2020
#5 #E;#F;#G 2020
#6 #F;#G;#H 2020
#7 #G;#H;#I 2019
#8 #H;#I;#J 2018
#9 #I;#J;#K 2020
#10 #J;#K;#L 2020
# make a df that looks like your data (all 10 rows listed above, including
# row 6, "#F;#G;#H" / 2020):
terms <- c(
  "#A;#B;#C",
  "#B;#C;#D",
  "#C;#D;#E",
  "#D;#E;#F",
  "#E;#F;#G",
  "#F;#G;#H",
  "#G;#H;#I",
  "#H;#I;#J",
  "#I;#J;#K",
  "#J;#K;#L"
)
year <- c(2017, 2016, 2021, 2020, 2020, 2020, 2019, 2018, 2020, 2020)
df <- data.frame(terms = terms, year = year, stringsAsFactors = FALSE)
# read your data from what I assume is your Data frame
terms <- df$terms
year.list <- df$year
loopcount <- length(terms)
# make new dummys; all.years starts as an empty data frame that rows are
# appended to
year <- c()
hashtags <- c()
all.years <- data.frame()
#split hashtags based on ";"
hashtag.list <- str_split(terms, ";")
With this loop you create a new DF
# make new df where every hashtag gets the information for year:
# flatten the list of hashtags and repeat each year once per hashtag found in
# its row, instead of growing the data frame inside a loop
all.years <- data.frame(
  hashtags = unlist(hashtag.list),
  year = rep(year.list, times = lengths(hashtag.list)),
  stringsAsFactors = FALSE
)
hashtagDF <- all.years
head(hashtagDF)
The new DF can then be used to to plot what you want
Or
if I understand you correct
you can make a new df were the frequency of the hashtags per year are shown
and only the top 3 hashtags are included
#only include the three most used hashtags per year
# dummys for new df
years.in.study <- unique(hashtagDF$year)
# one small frequency table per year, stacked once at the end
top.3 <- do.call(rbind, lapply(years.in.study, function(what.year) {
  #subset per year
  one.subset <- hashtagDF[hashtagDF$year == what.year, ]
  # calculate how often a hashtag is present per year
  frequency.per.year <- as.data.frame(table(one.subset))
  frequency.per.year <- frequency.per.year[order(-frequency.per.year$Freq), ]
  # only keep the 3 most occurring terms; head() also copes with years that
  # have fewer than 3 hashtags, where negative indexing would drop real rows
  head(frequency.per.year, 3)
}))
top.3
#order for year
top.3$year <- as.character(top.3$year)
top.3 <- top.3[order(top.3$year), ]
top.3
#year should be a factor
top.3$yearF <- as.factor(top.3$year)
Then you can plot it
# plot as barplot
# with
# the frequencies of the hashtags in different years.
# the top 3 most frequent hashtag terms per year
# Bars per year, filled by hashtag; all titles and axis labels are left empty
barplot <- ggplot(top.3, aes(x = yearF, y = Freq, fill = hashtags)) +
  geom_col() +
  labs(title = "", subtitle = "", caption = "", x = "", y = "")
barplot
# Write the plot to disk at 210 x 297 mm
ggsave(
  filename = "hashtag.png",
  plot = barplot,
  width = 210,
  height = 297,
  units = "mm"
)
It is hard to describe in words. Therefore, made a reprex
with input, output and expected output below
How can we data wrangle
1. When we function and mutate as shown below, there is ambiguity each time based on column name string
2. how can we rbind these once we have unique column names
library(tidyverse)
# Basically, "." means ",". So, better we remove . and PC and convert to Numeric
df1 <- tribble(
~`ABC sales 01.01.2019 - 01.02.2019`, ~code,
"1.019 PC", 2000, # Actually, it 1019 (remove . and PC )
"100 PC", 2101,
"3.440 PC", 2002
)
df2 <- tribble(
~`ABC sales 01.03.2019 - 01.04.2019`, ~year,
"6.019 PC", 2019,
"20 PC", 2001,
"043.440 PC", 2002
)
df3 <- tribble(
~`ABC sales 01.05.2019 - 01.06.2019`, ~year,
"1.019 PC", 2000,
"701 PC", 2101,
"6.440 PC", 2002
)
# Input data
input_df = list(df1,df2,df3)
#### function to clean data
# str_replace is used twice because
# remove PC and dot
# Clean one sales table.
#
# file  Data frame or tibble with exactly one column whose name contains
#       "ABC" (the sales column; the rest of its name varies by table).
#
# Returns `file` with that column renamed to "sales" and its values stripped
# of the unit "PC", the "." separators and any whitespace. Values stay
# character, so leading zeros are kept.
data_read <- function(file) {
  # Find the sales column by the fixed "ABC" part of its name
  sales_col <- grep("ABC", names(file), fixed = TRUE)
  stopifnot(length(sales_col) == 1L)
  # Remove "PC", dots and whitespace in one pass
  file[[sales_col]] <- gsub("PC|[.[:space:]]", "", file[[sales_col]])
  # Use one fixed name so the cleaned tables can be row-bound later
  names(file)[sales_col] <- "sales"
  file
}
# attempt to resolve
# To clean the data from dots and PC
output_df1 <- map(input_df, data_read) # one cleaned table per input table
# rbind: stack the cleaned tables into a single one. map() with rbind would
# call rbind() on each table separately and return the list unchanged.
output <- bind_rows(output_df1)
# Assigned to expected_output only: a chained `df3 <-` here would overwrite
# the input table df3 defined earlier.
expected_output <- tribble(
  ~sales, ~year,
  "1019", 2000,
  "100", 2101,
  "3440", 2002,
  "6019", 2019,
  "20", 2001,
  "043440", 2002,
  "1019", 2000,
  "701", 2101,
  "6440", 2002
)
Using purrr, dplyr and stringr, you can do:
input_df %>%
  # Give every table the same two column names so they can be stacked
  map(set_names, c("sales", "year")) %>%
  bind_rows() %>%
  # Strip dots, spaces and the letters P and C from the sales strings
  mutate(sales = str_remove_all(sales, "[. PC]"))
sales year
<chr> <dbl>
1 1019 2000
2 100 2101
3 3440 2002
4 6019 2019
5 20 2001
6 043440 2002
7 1019 2000
8 701 2101
9 6440 2002
I need some help to re-design the output of a function that comes through an R package.
My scope is to reshape a dataframe called output_IMFData in a way that look very similar to the shape of output_imfr. The codes of a MWE reproducing these dataframes are:
library(imfr)
# Same query as before, written with one argument per line
output_imfr <- imf_data(
  database_id = "IFS",
  indicator = "IAD_BP6_USD",
  country = "",
  start = 2010,
  end = 2014,
  freq = "A",
  return_raw = FALSE,
  print_url = TRUE,
  times = 3
)
and for output_IMFData
library(IMFData)
databaseID <- "IFS"
startdate <- "2010"
enddate <- "2014"
checkquery <- FALSE
queryfilter <- list(CL_FREA = "A", CL_AREA_IFS = "", CL_INDICATOR_IFS = "IAD_BP6_USD")
output_IMFData <- CompactDataMethod(databaseID, queryfilter, startdate, enddate,
checkquery)
the output from output_IMFData looks like this:
But, I want to redesign this dataframe to look like the output of output_imfr:
Sadly, I am not an advanced user and could not find anything that helps. My basic problem in converting output_IMFData into the second, panel-data-like data frame is that I don't know how to handle the Obs column without losing the correspondence with the reference code in #REF_AREA. That is, the column #REF_AREA holds the country codes and each entry of the column Obs holds the time series data of the respective country. This is a very cumbersome way of working with panel data, and therefore I want to reshape that data frame into the much nicer form of the output_imfr data frame.
The data of interest are stored in a list in the column Obs. Here is a dplyr solution to split the data, crack open the list, then stitch things back together.
longData <-
  output_IMFData %>%
  # One single-row data frame per input row; seq_len() yields an empty index
  # for zero rows, where 1:nrow(.) would count 1, 0
  split(seq_len(nrow(.))) %>%
  lapply(function(x) {
    # Combine the area code of this row with the contents of its Obs entry;
    # data.frame() repeats the code for every observation
    data.frame(
      iso2c = x[["#REF_AREA"]],
      x$Obs
    )
  }) %>%
  bind_rows()
head(longData)
gives:
iso2c X.TIME_PERIOD X.OBS_VALUE X.OBS_STATUS
1 FJ 2010 47.2107721901621 <NA>
2 FJ 2011 48.28347 <NA>
3 FJ 2012 51.0823499999999 <NA>
4 FJ 2013 157.015648875072 <NA>
5 FJ 2014 186.623232882226 <NA>
6 AW 2010 616.664804469274 <NA>
Here's another approach:
# Empty template; it is what remains when output_IMFData has no rows
NewDataFrame <- data.frame(iso2c = character(),
                           year = numeric(),
                           IAD_BP6_USD = character(),
                           stringsAsFactors = FALSE)
# One block of rows per row of output_IMFData. seq_len() skips the loop for an
# empty input, and an Obs entry without observations gives a zero-row block.
blocks <- lapply(seq_len(nrow(output_IMFData)), function(i) {
  obs <- output_IMFData$Obs[[i]]
  periods <- obs$`#TIME_PERIOD`
  data.frame(
    iso2c = rep(output_IMFData[["#REF_AREA"]][i], length(periods)),
    year = periods,
    IAD_BP6_USD = obs$`#OBS_VALUE`,
    stringsAsFactors = FALSE
  )
})
# Stack everything once instead of growing the data frame row by row
NewDataFrame <- do.call(rbind, c(list(NewDataFrame), blocks))
Below is the code I am trying to implement. I want to extract each run of 10 consecutive row values and turn it into the columns of one row.
This is how data looks like: https://drive.google.com/file/d/0B7huoyuu0wrfeUs4d2p0eGpZSFU/view?usp=sharing
I have been trying but temp1 and temp2 comes out to be empty. Please help.
library(Hmisc) #for increment function
# reading the csv file; strings are kept as character
myData <- read.csv("Clothing_&_Accessories.csv", header = FALSE, sep = ",",
                   fill = TRUE, stringsAsFactors = FALSE)
extract <- myData$V2 # extracting the desired column
# NOTE(review): assumes every 11th row is a separator to be skipped, as the
# counter in the original loop implied -- confirm against the file
is_separator <- seq_along(extract) %% 11 == 0
values <- extract[!is_separator]
# Pad an incomplete final record with NA so the values fill whole rows
n_records <- ceiling(length(values) / 10)
length(values) <- n_records * 10
# One record per row, with its 10 consecutive values as columns
data.sorted <- matrix(values, ncol = 10, byrow = TRUE)
Your code is more complex than it needs to be. You can do this with a single for loop and without external packages, like this:
# Toy input: ten "a", one empty separator, ten "b", in a single column V1
myData <- data.frame(
  V1 = c(rep("a", 10), "", rep("b", 10)),
  stringsAsFactors = FALSE
)
# Result frame with ten rows and, so far, no columns
newData <- data.frame(row.names = 1:10)
# Number of 10-value records, given one separator row between records
n_records <- (nrow(myData) + 1) / 11
for (k in 1:n_records) {
  # Records start at rows 1, 12, 23, ...; take the ten rows from there
  rows <- seq(from = 11 * (k - 1) + 1, length.out = 10)
  newData[[paste0("col", k)]] <- myData$V1[rows]
}
You don't actually need all this though. You can simply remove the empty lines, split the vector in chunks of size 10 (as explained here) and then turn the list into a data frame.
# Keep the non-empty entries, then cut them into consecutive groups of 10
vec <- myData$V1[myData$V1 != ""]
chunk_id <- ceiling(seq_along(vec) / 10)
as.data.frame(split(vec, chunk_id))
# X1 X2
# 1 a b
# 2 a b
# 3 a b
# 4 a b
# 5 a b
# 6 a b
# 7 a b
# 8 a b
# 9 a b
# 10 a b
We could create a numeric index based on the '' values in the 'V2' column, split the dataset, use Reduce/merge to get the columns in the wide format.
# Record number of every row: the counter steps up at each empty V2 value
indx <- cumsum(myData$V2 == '') + 1
# Join the records side by side on the field name in V1
res <- Reduce(function(...) merge(..., by = 'V1'), split(myData, indx))
# Put the rows back into the field order of the first ten input rows
res1 <- res[order(factor(res$V1, levels = myData[1:10, 1])), ]
# One column name per record, rather than assuming exactly three records
colnames(res1)[-1] <- paste0('Col', seq_len(ncol(res1) - 1))
head(res1, 3)
# V1 Col1 Col2 Col3
#2 ProductId B000179R3I B0000C3XXN B0000C3XX9
#4 product_title Amazon.com Amazon.com Amazon.com
#3 product_price unknown unknown unknown
From the p1.png, the 'V1' column can also be the column names for the values in 'V2'. If that is the case, we can 'transpose' the 'res1' except the first column and change the column names of the output with the first column of 'res1' (setNames(...))
# Transpose everything except the first column and use that first column
# (the field names) as the new column names
res2 <- setNames(as.data.frame(t(res1[-1]), stringsAsFactors = FALSE),
                 res1[, 1])
row.names(res2) <- NULL
# Convert each column to its natural type. as.is = TRUE keeps text as
# character and avoids the warning issued when as.is is left unspecified.
res2[] <- lapply(res2, type.convert, as.is = TRUE)
head(res2)
# ProductId product_title product_price userid
#1 B000179R3I Amazon.com unknown A3Q0VJTU04EZ56
#2 B0000C3XXN Amazon.com unknown A34JM8F992M9N1
#3 B0000C3XX9 Amazon.com unknown A34JM8F993MN91
# profileName helpfulness reviewscore review_time
#1 Jeanmarie Kabala "JP Kabala" 7/7 4 1182816000
#2 M. Shapiro 6/6 5 1205107200
#3 J. Cruze 8/8 5 120571929
# review_summary
#1 Periwinkle Dartmouth Blazer
#2 great classic jacket
#3 Good jacket
# review_text
#1 I own the Austin Reed dartmouth blazer in every color
#2 This is the second time I bought this jacket
#3 This is the third time I bought this jacket
I guess this is just a reshaping issue. In that case, we can use dcast from data.table to convert from long to wide format
library(data.table)
# Convert myData to a data.table and drop the rows with an empty V1
long <- setDT(myData)[V1 != '']
# Number the occurrences of each field name: Col1, Col2, ...
long[, N := paste0('Col', 1:.N), by = V1]
# One row per field name, one column per occurrence
DT <- dcast(long, V1 ~ N, value.var = 'V2')
data
myData <- structure(list(V1 = c("ProductId", "product_title",
"product_price",
"userid", "profileName", "helpfulness", "reviewscore", "review_time",
"review_summary", "review_text", "", "ProductId", "product_title",
"product_price", "userid", "profileName", "helpfulness",
"reviewscore",
"review_time", "review_summary", "review_text", "", "ProductId",
"product_title", "product_price", "userid", "profileName",
"helpfulness",
"reviewscore", "review_time", "review_summary", "review_text"
), V2 = c("B000179R3I", "Amazon.com", "unknown", "A3Q0VJTU04EZ56",
"Jeanmarie Kabala \"JP Kabala\"", "7/7", "4", "1182816000",
"Periwinkle Dartmouth Blazer",
"I own the Austin Reed dartmouth blazer in every color", "",
"B0000C3XXN", "Amazon.com", "unknown", "A34JM8F992M9N1",
"M. Shapiro",
"6/6", "5", "1205107200", "great classic jacket",
"This is the second time I bought this jacket",
"", "B0000C3XX9", "Amazon.com", "unknown", "A34JM8F993MN91",
"J. Cruze", "8/8", "5", "120571929", "Good jacket",
"This is the third time I bought this jacket"
)), .Names = c("V1", "V2"), row.names = c(NA, 32L),
class = "data.frame")