R: adding matching vector values from two dataframes in one column - r

I have a data frame which is configured roughly like this:
df <- cbind(c('hello', 'yes', 'example'),c(7,8,5),c(0,0,0))
words
frequency
count
hello
7
0
yes
8
0
example
5
0
What I'm trying to do is add values to the third column from a different data frame, which is similar but looks like this:
df2 <- cbind(c('example','hello') ,c(5,6))
words
frequency
example
5
hello
6
My goal is to find matching values for the first column in both data frames (they have the same column name) and add matching values from the second data frame to the third column of the first data frame.
The result should look like this:
df <- cbind(c('hello', 'yes', 'example'),c(7,8,5),c(6,0,5))
words
frequency
count
hello
7
6
yes
8
0
example
5
5
What I've tried so far is:
df <- merge(df,df2, by = "words", all.x=TRUE)
However, it doesn't work.
I could use some help understanding how could it be done. Any help will be welcome.

This is an "update join". My favorite way to do it is in dplyr:
library(dplyr)
df %>% rows_update(rename(df2, count = frequency), by = "words")
In base R you could do the same thing like this:
# Rename df2's value column so it cannot collide with df$frequency in the merge.
names(df2)[2] <- "count2"
# Left join: keep every row of df; words absent from df2 get NA in count2.
df <- merge(df, df2, by = "words", all.x = TRUE)
# Use the new value where a match was found, otherwise keep the existing count.
df$count <- ifelse(is.na(df$count2), df$count, df$count2)
# Drop the helper column now that count has been updated.
df$count2 <- NULL

Here is an option with data.table:
library(data.table)
setDT(df)[setDT(df2), on = "words", count := i.frequency]
Output
words frequency count
<char> <num> <num>
1: hello 7 6
2: yes 8 0
3: example 5 5
Or using match in base R:
df$count[match(df2$words, df$words)] <- df2$frequency
Or another option with tidyverse using left_join and coalesce:
library(tidyverse)
# Bring df2's values in as `count.y`, then prefer them over the existing
# `count`; coalesce() falls back to `count` where df2 had no matching word.
left_join(df, df2 %>% rename(count.y = frequency), by = "words") %>%
  mutate(count = coalesce(count.y, count)) %>%
  select(-count.y)
Data
df <- structure(list(words = c("hello", "yes", "example"), frequency = c(7,
8, 5), count = c(0, 0, 0)), class = "data.frame", row.names = c(NA,
-3L))
df2 <- structure(list(words = c("example", "hello"), frequency = c(5, 6)), class = "data.frame", row.names = c(NA,
-2L))

Related

find a row that has a string that contains a certain string, then take the row on top, the string row and row under and move it to a new dataframe

So I have a table that looks like this:
I want to search through the first column and, every time I see nl.audio, take the row on top, the nl.audio row and the row right under it and move them to a new column so it looks like this:
not sure how to go about doing this.
the table comes from trying to get nested json values into a dataframe. like this
library(jsonlite)
library(tidyverse)
files <- list.files(path=".", pattern=".json", all.files=FALSE,
full.names=FALSE)
data <- fromJSON(files[1])
dat2 <- unlist(data$translation_map)
dat2 <- as.data.frame(dat2)
dput:
structure(list(dat2 = c("Iraat.",
" _1645805605.mp3",
"Ie.", "wn", "", "Wdis.",
"ewdewf.mp3",
"wedew.", "[k]ws.[/k]",
" _1645805740.mp3",
"edwedwedw.", "Ik ewwewe[/k].",
"we45805760.mp3",
"I h89.", "ewd3n", "", "ad23dt", "",
"Ik d2. ", "I d2d3.",
"Ha3d3d/k] 20.", "H3d20.",
"id3n", "", "straat")), row.names = c("str-5e854867d9c6.nl.value",
"str_f15f7751-227dc6.nl.audio", "str_f15f7751.en.value",
"str.nl.value", "str_172a516ca.en.value",
"str_4567f686.nl.value", "str_4.nl.audio",
"stcb0ca14.en.value", "str_622f99395.nl.value",
"str_622f9395.nl.audio", "str_622f90de9395.en.value",
"str_f25afe16.nl.value", "str_f2fad09045afe16.nl.audio",
"str_f2fad89045afe16.en.value", "s9e844c432e80.nl.value",
"str_b0c1b42e80.en.value", "str_e6d847f3-60b7-.nl.value",
"str_.en.value", "str_b61f9404-.nl.value",
"str_ b.en.value", "str_76e28ea6.nl.value",
"str-61a1b83bf1ba.en.value", "str_6280d5a49c42a24.nl.value",
"str5-0d5a49c42a24.en.value", "str_5e6b2202e748.nl.value"
), class = "data.frame")
Something like this:
library(dplyr)
library(stringr)
# For each row whose V1 matches 'nl.audio', collect the V2 value of the row
# above (A), the .mp3 value on the row itself (B) and the V2 value of the
# row below (C).
df %>%
# Squish whitespace (e.g. the leading tabs in V2); across() is called with no
# column selection here -- presumably it applies to every column.
mutate(across(,str_squish)) %>%
# 'nl.audio' is used as a regex, so the '.' matches any character.
# A is NA on every row that is not an nl.audio row.
mutate(A = ifelse(str_detect(V1, 'nl.audio'), lag(V2), NA_character_),
# Alternative pattern that keeps only the digits before .mp3:
# B = str_extract(V2, '\\d+.mp3'),
# B is NA unless V2 ends in "mp3".
B = str_extract(V2, '.*.mp3$'),
# C mirrors A but looks one row ahead.
C = ifelse(str_detect(V1, 'nl.audio'), lead(V2), NA_character_),
# Drop the columns that were consumed to build A, B and C (V1, V2).
.keep= "unused") %>%
# Only rows with A, B and C all present survive.
na.omit()
A B C
2 nstraat. 1645805605.mp3 constraat.
7 tihdhis. 645805622.mp3 use.
df <- structure(list(V1 = c("str_f15d9c6.nl.value", "47c-5e854867d9c6.nl.audio",
"5e854867d9c6.en.value", "92bd-91b8f180bd3a.nl.value", "4-92bd-91b8f180bd3a.en.value",
"40a8-88ef-5890ecbOca14.nl.value", "890ecbOca14.nl.audio", "ca14.en.value"
), V2 = c("\tnstraat.", "\t1645805605.mp3", "\tconstraat.", "\tlemons",
" \t", "\ttihdhis.", "\t645805622.mp3", "\tuse.")), class = "data.frame", row.names = c(NA,
-8L))
We may need grep to find the index. Then add and subtract 1 to the index and extract the values from the second column based on that index (assuming data.frame columns)
# Rows whose first column contains the literal text "nl.audio".
i1 <- grep("nl.audio", df1[[1]], fixed = TRUE)
prev_ind <- i1 - 1
next_ind <- i1 + 1
# One output row per match, taken from the second column: the value on the
# row above, the value on the matching row itself, and the value on the row
# below.
data.frame(col1 = df1[[2]][prev_ind],
           col2 = df1[[2]][i1],
           col3 = df1[[2]][next_ind])

Splitting strings into components

For example, I have a data table with several columns:
column A column B
key_500:station and loc 2
spectra:key_600:type 9
alpha:key_100:number 12
I want to split the rows of column A into components and create new columns, guided by the following rules:
the value between "key_" and ":" will be var1,
the next value after ":" will be var2,
the original column A should retain the part of string that is prior to ":key_". If it is empty (as in the first line), then replace "" with an "effect" word.
My expected final data table should be like this one:
column A column B var1 var2
effect 2 500 station and loc
spectra 9 600 type
alpha 12 100 number
Using tidyr extract you can extract specific part of the string using regex.
# Capture the digits after "key_" into var1 and everything after the
# following ":" into var2; remove = FALSE keeps columnA for the next step.
tidyr::extract(df, columnA, into = c('var1', 'var2'), 'key_(\\d+):(.*)',
convert = TRUE, remove = FALSE) %>%
# Strip "key_..." (and an optional preceding ":") so only the prefix remains,
# then replace an empty prefix with the placeholder "effect".
dplyr::mutate(columnA = sub(':?key_.*', '', columnA),
columnA = replace(columnA, columnA == '', 'effect'))
# columnA var1 var2 columnB
#1 effect 500 station and loc 2
#2 spectra 600 type 9
#3 alpha 100 number 12
If you want to use data.table you can break this down in steps :
library(data.table)
# Convert df to a data.table by reference so := can add/modify columns in place.
setDT(df)
# var1: the digits following "key_"; var2: whatever follows "key_<digits>:".
df[, c('var1', 'var2') := .(sub('.*key_(\\d+).*', '\\1',columnA),
sub('.*key_\\d+:', '', columnA))]
# Keep only the part of columnA before ":key_" (empty when it starts with "key_").
df[, columnA := sub(':?key_.*', '', columnA)]
# An empty prefix is replaced by the placeholder "effect".
df[, columnA := replace(columnA, columnA == '', 'effect')]
data
df <- structure(list(columnA = c("key_500:station and loc",
"spectra:key_600:type", "alpha:key_100:number"),
columnB = c(2L, 9L, 12L)), class = "data.frame", row.names = c(NA, -3L))
You can use separate which uses non-letters and separates the string into columns defined in into
# library() fails loudly if a package is missing; require() only returns FALSE.
library(tidyr)
library(dplyr)
df <- tribble(
  ~"column A", ~"column B",
  "key_500:station", 2,
  "spectra:key_600:type", 9,
  "alpha:key_100:number", 12)
# Split `column A` into four pieces, padding short rows with NA on the left,
# so a row without a prefix ends up with NA in `column A`.
df %>%
  separate("column A", into = c('column A', 'key', 'var1', 'var2'),
           fill = 'left') %>%
  select(-key) %>%
  select("column A", "column B", var1, var2) %>%
  # A missing prefix becomes the placeholder "effect".
  mutate(`column A` = ifelse(is.na(`column A`), "effect", `column A`))
And this is a modified version to work with data.tables
# library() fails loudly if a package is missing; require() only returns FALSE.
library(tidyr)
library(data.table)
DT <- data.table(
  "column A" =
    c("key_500:station and loc",
      "spectra:key_600:type",
      "alpha:key_100:number"),
  "column B" = c(2, 9, 12))
# Split only on runs of characters that are neither alphanumeric nor a space,
# so "station and loc" stays in one piece; short rows are padded with NA on
# the left.
DT <- separate(DT, "column A", sep = "[^[:alnum:] ]+",
               into = c('column A', 'key', 'var1', 'var2'), fill = 'left')
DT$key <- NULL
# A missing prefix (NA after left-filling) becomes the placeholder "effect".
DT$`column A` <- ifelse(is.na(DT$`column A`), "effect", DT$`column A`)
# Reorder to: column A, column B, var1, var2.
DT <- DT[, c(1, 4, 2, 3)]

How do I aggregate data in R in a way that returns the entire row that satisfies the aggregation condition? [no dplyr]

I have data that looks like this:
ID FACTOR_VAR INT_VAR
1 CAT 1
1 DOG 0
I want to aggregate by ID such that the resulting dataframe contains the entire row that satisfies my aggregate condition. So if I aggregate by the max of INT_VAR, I want to return the whole first row:
ID FACTOR_VAR INT_VAR
1 CAT 1
The following will not work because FACTOR_VAR is a factor:
new_data <- aggregate(data[,c("ID", "FACTOR_VAR", "INT_VAR")], by=list(data$ID), fun=max)
How can I do this? I know dplyr has a group by function, but unfortunately I am working on a computer for which downloading packages takes a long time. So I'm looking for a way to do this with just vanilla R.
If you want to keep all the columns, use ave instead :
# Within each ID, flag the rows where INT_VAR equals the group maximum; ave()
# returns the flags aligned with df's rows, as.logical() turns them back into
# TRUE/FALSE, and subset() keeps the flagged rows with all their columns.
subset(df, as.logical(ave(INT_VAR, ID, FUN = function(x) x == max(x))))
You can use aggregate for this. If you want to retain all the columns, merge can be used with it.
# Compute the per-ID maximum of INT_VAR, then join back to df on the shared
# columns (ID, INT_VAR) to recover the remaining columns of the matching rows.
merge(aggregate(INT_VAR ~ ID, data = df, max), df, all.x = TRUE)
# ID INT_VAR FACTOR_VAR
#1 1 1 CAT
data
df <- structure(list(ID = c(1L, 1L), FACTOR_VAR = structure(1:2, .Label = c("CAT", "DOG"), class = "factor"), INT_VAR = 1:0), class = "data.frame", row.names = c(NA,-2L))
We can do this in dplyr
library(dplyr)
# Keep, within each ID, the row(s) whose INT_VAR equals the group maximum.
# Every step must be chained with %>%; otherwise filter() runs on its own
# and cannot find INT_VAR.
df %>%
  group_by(ID) %>%
  filter(INT_VAR == max(INT_VAR))
Or using data.table
library(data.table)
setDT(df)[, .SD[INT_VAR == max(INT_VAR)], by = ID]

if cell equal to the cell below then perform an action in R

In R I am trying to go through a df and, if the data in a column matches the data in the row below it, make the data from another column match too. I haven't really got too far other than considering a for loop such as the one below. Another option is to use an if statement, but I'm sure there is a better way.
For (row in 1:nrow(df)){ insert code here maybe}
An example of my data is below
id name
1 M1
2 M1
3 M1
4 M2
5 M2
I would like it to look like this
id name
1 M1
1 M1
1 M1
4 M2
4 M2
We group by 'name' and mutate to 'id' to get the first element of 'id'
library(dplyr)
df1 %>%
group_by(name) %>%
mutate(id = first(id))
Or with data.table
library(data.table)
setDT(df1)[, id := first(id), by = name]
Or in base R
df1$id <- with(df1, ave(id, name, FUN = function(x) x[1]))
data
df1 <- structure(list(id = 1:5, name = c("M1", "M1", "M1", "M2", "M2"
)), class = "data.frame", row.names = c(NA, -5L))

Create a sequence of strings

Given a data set similar to the following
dat = structure(list(OpportunityId = c("006a000000zLXtZAAW", "006a000000zLXtZAAW",
"006a000000gst", "006a000000gstg", "006a000000gstg",
"006a000000zLXtZAAW"), IsWon = c(1, 1, 1, 1, 1, 1),
sequence = c("LLLML", "LHHHL", "LLLML", "HMLLL", "LLLLL", "LLLLL")),
.Names = c("OpportunityId","IsWon", "sequence"), row.names = c(NA, 6L), class = "data.frame")
dat
How would one go about adding each sequence that is associated with a particular opportunity id, such that the final looks like.
oppid sequence
006... LLL, LML, MMM
007... MMM, MML, MMH, LLL, HHH
007... LML, MMM
Any ideas?
We can paste the 'sequence' after grouping by 'OpportunityId'
library(data.table)
setDT(dat)[, .(sequence = toString(unique(sequence))) ,
by = .(oppid = OpportunityId)]
Maybe a combination of aggregate and unique could help.
aggregate(sequence ~ OpportunityId, dat, unique)
# OpportunityId sequence
#1 006a000000gst LLLML
#2 006a000000gstg HMLLL, LLLLL
#3 006a000000zLXtZAAW LLLML, LHHHL, LLLLL
As pointed out by @akrun in a comment, the sequence column is stored as a list in this case.
If necessary, the list in the sequence column can be converted into character format (a single string for each row) by means of:
dat$sequence <- sapply(dat$sequence, paste, collapse=", ")
With dplyr
library(dplyr)
dat_new <- dat %>%
group_by(OpportunityId) %>%
summarise(sequence = toString(sequence)) %>%
distinct(.keep_all = TRUE)
Output
# OpportunityId IsWon sequence
# 1 006a000000zLXtZAAW 1 LLLML, LHHHL, LLLLL
# 2 006a000000gst 1 LLLML
# 3 006a000000gstg 1 HMLLL, LLLLL

Resources