input:
tmp <- "(12,'chinese'),(13,'italian'),(14,'spanish')"
desired output:
id food_type
12 chinese
13 italian
14 spanish
I tried gsub/string split but realized that feeding into new row and column is another problem. thanks
Here is a solution based on eval(parse(text = )) by convert the string into an expression:
x <- eval(parse(text = paste0('list(', gsub('\\(', 'c\\(', tmp), ')')))
res <- as.data.frame(do.call(rbind, x), stringsAsFactors = FALSE)
names(res) <- c('id', 'food_type')
res
# id food_type
# 1 12 chinese
# 2 13 italian
# 3 14 spanish
Using strsplit and sub:
tmp <- "(12,'chinese'),(13,'italian'),(14,'spanish')"
terms <- strsplit(tmp, "(?<=\\)),(?=\\()", perl=TRUE)
df <- lapply(terms[[1]], function(x) {
id <- sub("^\\(([^,]*).*", "\\1", x)
food_type <- sub(".*,'(.*)'\\)", "\\1", x)
z <- c(id, food_type)
return(z)
})
df <- do.call(rbind.data.frame, df)
names(df) <- c("id", "food_type")
df
id food_type
1 12 chinese
2 13 italian
3 14 spanish
Demo
Hey checkout this solution i hope it will help you.
tmp1=gsub("\\'","",gsub("\\(","",unlist(strsplit(unlist(strsplit(tmp,",")),"\\)"))))
id=as.numeric(tmp1[seq(1,length(tmp2),2)])
fooditem=tmp1[seq(0,length(tmp2),2)]
res=data.frame(id,fooditem)
id fooditem
1 12 chinese
2 13 italian
3 14 spanish
I was looking for a way to change the input and use read.table function to get the desired output. The final steps turned out to be:
df <- lapply(strsplit(tmp, "\\(|\\)\\,?", perl = TRUE), function(x){
x <- x[x != ""]
read.table(text = paste0(x), sep = ",", header = FALSE, stringsAsFactors = FALSE)
})
df <- do.call(rbind, df)
names(df) <- c("id", "food_type")
# Result:
#> df
# id food_type
#1 12 chinese
#2 13 italian
#3 14 spanish
Related
I would like to read in a table then use gsub to return a part of the text. I know gsub requires a character vector format. Instead of getting the desired samp list of 'C516_A1_B1' and pat list of'C516' etc, I get'1:5'. What is the simplest way to fix this? Thanks!
bamlist <- read.table('pathtotxtfile.txt')
for (y in bamlist) {
samp <- gsub('EPICC_(C\\S+)_S1\\S+$','\\1', bamlist)
pat <- gsub('(C\\d+)_\\S+$','\\1', samp)
}
bamlist:
EPICC_C516_A1_B1_S1-GRCh38.bam
EPICC_C516_A1_G4_S1-GRCh38.bam
EPICC_C516_B1_G7_S1-GRCh38.bam
EPICC_C516_B1_G8_S1-GRCh38.bam
EPICC_C516_B3_B1_S1-GRCh38.bam
Why loop, sub is vectorized over x.
samp <- sub("^[^_]*_(.*)_[^_]*$", "\\1", bamlist)
pat <- sub("(^[^_]+)_.*$", "\\1", samp)
samp
#[1] "C516_A1_B1" "C516_A1_G4" "C516_B1_G7" "C516_B1_G8"
#[5] "C516_B3_B1"
pat
#[1] "C516" "C516" "C516" "C516" "C516"
Data
bamlist <- scan(what = character(), text = "
EPICC_C516_A1_B1_S1-GRCh38.bam
EPICC_C516_A1_G4_S1-GRCh38.bam
EPICC_C516_B1_G7_S1-GRCh38.bam
EPICC_C516_B1_G8_S1-GRCh38.bam
EPICC_C516_B3_B1_S1-GRCh38.bam
")
Edit
Following user #akrun's comment, here is a way to apply the above code to a data.frame.
lapply(bamlist, function(y){
samp <- sub("^[^_]*_(.*)_[^_]*$", "\\1", y)
pat <- sub("(^[^_]+)_.*$", "\\1", samp)
data.frame(samp = samp, pat = pat)
})
#$X
# samp pat
#1 C516_A1_B1 C516
#2 C516_A1_G4 C516
#3 C516_B1_G7 C516
#4 C516_B1_G8 C516
#5 C516_B3_B1 C516
The data would now be
X <- scan(what = character(), text = "
EPICC_C516_A1_B1_S1-GRCh38.bam
EPICC_C516_A1_G4_S1-GRCh38.bam
EPICC_C516_B1_G7_S1-GRCh38.bam
EPICC_C516_B1_G8_S1-GRCh38.bam
EPICC_C516_B3_B1_S1-GRCh38.bam
")
bamlist <- data.frame(X)
I'm trying to find a way to lookup multiple values in a dataframe and return a value. Simplified example:
df1 <- read.table(text="chk1 chk2 chk3 value
xx aa;bb;cc jj 1
xx;yy dd;ee;ff kk 2
zz gg;hh;ii ll;nn 3", header=T)
df2 <- read.table(text="val1 val2 val3
xx bb jj
xx dd kk
yy ee kk
zz hh jj
", header=T)
Lookup values val1, val2, and val3 from df2 in df1, return value from df1.
Desired results:
df2 <- read.table(text="
val1 val2 val3 value
xx bb jj 1
xx dd kk 2
yy ee kk 2
zz hh jj NA
")
Tried match x %in% y and looping over the rows, can't get it to work.
Here is one possibility:
library(tidyverse)
df3 <- df2 %>% rowwise %>%
mutate(rowmatch=which(grepl(val1, df1$chk1) &
grepl(val2, df1$chk2) &
grepl(val3, df1$chk3))[1],
value=df1$value[rowmatch])
Result:
# A tibble: 4 x 5
val1 val2 val3 rowmatch value
<chr> <chr> <chr> <int> <int>
1 xx bb jj 1 1
2 xx dd kk 2 2
3 yy ee kk 2 2
4 zz hh jj NA NA
Notes:
the [1] is to ensure that only first of the matching rows is used.
note that although rowmatch and value are identical in this example this is only because df1$value is equal to the row number.
tibble behaves like a data.frame, but if you really prefer a data frame, add %>% as.data.frame
The same can be done with base R and apply:
df2$rowmatch <- with(df1, apply(df2, 1, function(x)
which(grepl(x["val1"], chk1) &
grepl(x["val2"], chk2) &
grepl(x["val3"], chk3))[1]))
df2$value <- df1$value[df2$rowmatch]
another option would be splitting the values first:
df1 <- df1 %>%
splitstackshape::cSplit("chk1", ";", fixed = TRUE, direction = "long", drop = FALSE, type.convert = FALSE) %>%
splitstackshape::cSplit("chk2", ";", fixed = TRUE, direction = "long", drop = FALSE, type.convert = FALSE) %>%
splitstackshape::cSplit("chk3", ";", fixed = TRUE, direction = "long", drop = FALSE, type.convert = FALSE)
and then using join
You can also do it using two nested for loops. The logic is to take first row of df2 and then start going through rows of df1 to see if df2$val1 matches df1$chk, df2$val2 matches df1$chk2 and df2$val3 matches df1$chk3. I consider all values a match if there is at least one match per column. The caverat here is that if df2 does not have unique rows, the last matching row from df1 will be written to df2. But this can be changed by breaking out of the loop as soon as the match is found.
for (i in 1:nrow(df2)) {
for (j in 1:nrow(df1)) {
# Take i-th row and split by ;. Result is a vector of strings against
# which we'll use match.
i.split <- strsplit(as.character(unlist(df1[j, , drop = TRUE][-4])), ";")
# Pairwise check columns from df1 and df2.
all.ok <- all(mapply(FUN = function(x, y) {
any(x %in% y)
}, x = i.split, y = as.list(df2[i, 1:3])
))
if (all.ok) {
# If a match is found, write the value to df2.
df2[i, "value"] <- df1[j, "value"]
}
}
}
Output:
val1 val2 val3 value
1 xx bb jj 1
2 xx dd kk 2
3 yy ee kk 2
4 zz hh jj NA
How can I extract the user_id from the retweets collected using this function?
## get only first 8 words from each tweet
x <- lapply(strsplit(dat$text, " "), "[", 1:8)
x <- lapply(x, na.omit)
x <- vapply(x, paste, collapse = " ", character(1))
## get rid of hyperlinks
x <- gsub("http[\\S]{1,}", "", x, perl = TRUE)
## encode for search query (handles the non ascii chars)
x <- sapply(x, URLencode, USE.NAMES = FALSE)
## get up to first 100 retweets for each tweet
data <- lapply(x, search_tweets, verbose = FALSE)
I have 12 elements, each contains a list of user ids, how can I extract the user ids only?
here is the full code:
library(rtweet)
library(dplyr)
library(plyr)
require(reshape2)
## search for day of rage tweets, try to exclude rt here
dor <- search_tweets("#Newsnight -filter:retweets", n = 10000)
## merge tweets data with unique (non duplicated) users data
## exclude retweets
## select status_id, retweet count, followers count, and text columns
dat <- dor %>%
users_data() %>%
unique() %>%
right_join(dor) %>%
filter(!is_retweet) %>%
dplyr::select(user_id, screen_name, retweet_count, followers_count, text) %>%
filter(retweet_count >=50 & retweet_count <100 & followers_count < 10000 & followers_count > 500)
dat
## get only first 8 words from each tweet
x <- lapply(strsplit(dat$text, " "), "[", 1:8)
x <- lapply(x, na.omit)
x <- vapply(x, paste, collapse = " ", character(1))
## get rid of hyperlinks
x <- gsub("http[\\S]{1,}", "", x, perl = TRUE)
## encode for search query (handles the non ascii chars)
x <- sapply(x, URLencode, USE.NAMES = FALSE)
## get up to first 100 retweets for each tweet
data <- lapply(x, search_tweets, verbose = FALSE)
There are 11 more elements like this
12 elements
Ok, so you have a list of 12 dataframes, each has a column called user_id. if the list is named, then this will work, if it isn't named, then take out the df_name = names(data)[x], part.
lapply(1:12, function(x) {
df <- data[[x]]
data.frame(user_id = df$user_id,
# df_name = names(data)[x],
df_number = x, stringsAsFactors=FALSE) } ) %>%
dplyr::bind_rows()
That should give you a new dataframe with all of the userids and which previous dataframe they came from.
I'd like to make a column of (possibly) non-unique strings into a column of unique strings.
For instance, consider:
df <- data.frame(
'Initials' = c("AA","AB","AB")
, 'Data' = c(1,2,3)
)
df
Initials Data
1 AA 1
2 AB 2
3 AB 3
I would like to obtain this:
Initials Data
1 AA 1
2 AB (1) 2
3 AB (2) 3
Note: I know I could use the rownames to uniquely identify the row, but I'd like to retain the string stored in the Initials column, with a number appended.
transform(df, Initials = ave(as.character(Initials), Initials,
FUN = function(x) if (length(x) > 1) paste0(x, " (", seq(x), ")") else x))
# Initials Data
# 1 AA 1
# 2 AB (1) 2
# 3 AB (2) 3
w <- ave(df$Data, df$Initials, FUN = seq_along )
> df$Initials <- paste(df$Initials, "(", w, ")", sep = "")
# > df
# Initials Data
# 1 AA(1) 1
# 2 AB(1) 2
# 3 AB(2) 3
TLDR
Can use make_unique from the makeunique package (Disclaimer: I'm the author)
# install.packages('makeunique')
library(makeunique)
df <- data.frame(
Initials = c("AA","AB","AB"),
Data = c(1,2,3)
)
df[['Initials']] <- make_unique(df[['Initials']])
df
#> Initials Data
#> 1 AA 1
#> 2 AB (1) 2
#> 3 AB (2) 3
Created on 2022-10-15 by the reprex package (v2.0.1)
Full Answer
Disclaimer: I'm the author the makeunique package
The problem with #Sven Hohenstein's answer is that it can still produce duplicates. For example if your input dataset happens to include a string 'AB (2)'. See example of issue below
df <- data.frame(
Initials = c("AA","AB","AB", "AB (2)"),
Data = c(1,2,3,4)
)
transform(df, Initials = ave(as.character(Initials), Initials,
FUN = function(x) if (length(x) > 1) paste0(x, " (", seq(x), ")") else x))
#> Initials Data
#> 1 AA 1
#> 2 AB (1) 2
#> 3 AB (2) 3
#> 4 AB (2) 4
Created on 2022-10-15 by the reprex package (v2.0.1)
Really you'd either want something like make_unique from the makeunique package that warns you when this happens and lets you change suffix formatting to fix the issue
# install.packages('makeunique')
library(makeunique)
df <- data.frame(
Initials = c("AA","AB","AB", "AB (2)"),
Data = c(1,2,3,4)
)
df[['Initials']] <- make_unique(df[['Initials']])
#> Error in make_unique(df[["Initials"]]): make_unique failed to make vector unique.
#> This is because appending ' <dup_number>' to duplicate values led tocreation of term(s) that were in the original dataset:
#> [AB (2)]
#>
#> Please try again with a different argument for either `wrap_in_brackets` or `sep`
# Change suffix format to fix problem
df[['Initials']] <- make_unique(df[['Initials']], sep = "-")
Created on 2022-10-15 by the reprex package (v2.0.1)
Or, you could make.unique from base R that ensures uniqueness but doesn't let you control how your suffixes look.
If you don't want to use a package, feel free to just copy the source function for make_unique into your code and use it like your own function
make_unique <- function(x, sep = " ", wrap_in_brackets = TRUE, warn_about_type_conversion = TRUE){
if(!(is.character(sep) & length(sep) == 1)) stop('`sep` must be a string, not a ', paste0(class(sep), collapse = " "))
if(!(is.logical(wrap_in_brackets) & length(wrap_in_brackets) == 1)) stop('`wrap_in_brackets` must be a flag, not a ', paste0(class(wrap_in_brackets), collapse = " "))
if(!(is.logical(warn_about_type_conversion) & length(warn_about_type_conversion) == 1)) stop('`warn_about_type_conversion` must be a flag, not a ', paste0(class(warn_about_type_conversion), collapse = " "))
if(!any(is.numeric(x),is.character(x),is.factor(x))) stop('input to `make_unique` must be a character, numeric, or factor variable')
if(is.factor(x)) {
if(warn_about_type_conversion) warning('make_unique: Converting factor to character variable')
x <- as.character(x)
}
else if(is.numeric(x)) {
if(warn_about_type_conversion) warning('make_unique: Converting numeric variable to a character vector')
x <- as.character(x)
}
deduplicated = stats::ave(x, x, FUN = function(a){
if(length(a) > 1){
suffixes <- seq_along(a)
if(wrap_in_brackets) suffixes <- paste0('(', suffixes, ')')
paste0(a, sep, suffixes)
}
else {a}
})
values_still_duplicated <- deduplicated[duplicated(deduplicated)]
if(length(stats::na.omit(values_still_duplicated)) > 0){
stop(
"make_unique failed to make vector unique.\n",
"This is because appending ' <dup_number>' to duplicate values led to",
"creation of term(s) that were in the original dataset: \n[",
paste0(values_still_duplicated, collapse = ', '),
"]\n\nPlease try again with a different argument for either `wrap_in_brackets` or `sep`"
)
}
return(deduplicated)
}
I'm thumbling around with the following problem, but to no evail:
d <- data.frame(value = 1:4, row.names = c("abc", "abcd", "ef", "gh"))
value
abc 1
abcd 2
ef 3
gh 4
l <- nrow(d)
wordmat <- matrix(rep(NA, l^2), l, l, dimnames = list(row.names(d), row.names(d)))
for (i in 1:ncol(wordmat)) {
rid <- agrep(colnames(wordmat)[i], rownames(wordmat), max = 0)
d$matchid[i] <- paste(rid, collapse = ";")
}
# desired output:
(d_agg <- data.frame(value = c(3, 3, 4), row.names = c("abc;abcd", "ef", "gh")))
value
abc;abcd 3
ef 3
gh 4
is there a function for this?
Here's a possible solution that you might be able to modify to suit your needs.
Some notes:
I couldn't figure out how to deal with rownames() directly, particularly in the last stage, so this depends on you being happy with copying your row names as a new variable.
The function below "hard-codes" the variable names, functions, and so on. That is to say, it is not by any means a generalized function, but one which might be useful as you look further into this problem.
Here's the function.
matches <- function(data, ...) {
temp = vector("list", nrow(data))
for (i in 1:nrow(data)) {
temp1 = agrep(data$RowNames[i], data$RowNames, value = TRUE, ...)
temp[[i]] = data.frame(RowNames = paste(temp1, collapse = "; "),
value = sum(data[temp1, "value"]))
}
temp = do.call(rbind, temp)
temp[!duplicated(temp$RowNames), ]
}
Note that the function needs a column called RowNames, so we'll create that, and then test the function.
d <- data.frame(value = 1:4, row.names = c("abc", "abcd", "ef", "gh"))
d$RowNames <- rownames(d)
matches(d)
# RowNames value
# 1 abc; abcd 3
# 3 ef 3
# 4 gh 4
matches(d, max.distance = 2)
# RowNames value
# 1 abc; abcd 3
# 3 abc; abcd; ef; gh 10
matches(d, max.distance = 4)
# RowNames value
# 1 abc; abcd; ef; gh 10
This works for your example but may need tweaking for the real thing:
d <- data.frame(value = 1:4, row.names = c("abc", "abcd", "ef", "gh"))
rowclust <- hclust(as.dist(adist(rownames(d))), method="single")
rowgroups <- cutree(rowclust, h=1.5)
rowagg <- aggregate(d, list(rowgroups), sum)
rowname <- unclass(by(rownames(d), rowgroups, paste, collapse=";"))
rownames(rowagg) <- rowname
rowagg
Group.1 value
abc;abcd 1 3
ef 2 3
gh 3 4