Numbers stick together as characters - r

I have a dataset with measured values (txt file, whitespace separated) and some numbers stick together like this:
Currently all columns are of class "character", since after conversion those pasted numbers got "NA"s. I created a routine for negative numbers, which was easy so far:
# Read a whitespace-separated file and put a space in front of every minus
# sign that directly follows a digit, so fused values such as "1.5-2.3"
# become "1.5 -2.3".
#
# file_name: path passed straight to read_table2() (not defined in this
#   snippet; presumably readr -- confirm it is loaded).
# Returns the cleaned data as a data.frame. The earlier version ended on the
# `for` loop and therefore returned NULL.
findandreplace <- function(file_name) {
  dat <- read_table2(file_name, col_names = FALSE)
  # A single pass with a back-reference handles all ten digits at once.
  dat <- data.frame(lapply(dat, function(x) {
    gsub("([0-9])-", "\\1 -", x)
  }))
  #save dat as txt and read it again
  dat
}
But now, I have no idea how to separate positive values. If you want you can use this MWE:
b = c("340.9","341","316.1","336.8316.39","378.8","315","386.57317.33",NA,NA)
a =c(1,2,3,4,5,6,7,8,9)
c = data.frame(a,b)
This is how it should be:
b = c("340.9","341","316.1","336.8","316.39","378.8","315","386.57", "317.33")
a =c(1,2,3,4,5,6,7,8,9)
c = data.frame(a,b)

# Insert a space before the last "3dd." group in each string, then split on
# that space. NOTE(review): the pattern is tied to values that start with 3
# and have three integer digits, as in the sample data -- confirm this holds
# for the real file.
x <- unlist(strsplit(gsub("(.*)(3(?>\\d{2}\\.))", "\\1 \\2", b, perl = TRUE), " "))
# Keep only the pieces that contain a digit (drops NA and empty strings).
grep("\\d", x, value = TRUE)
[1] "340.9" "341" "316.1" "336.8" "316.39" "378.8" "315" "386.57" "317.33"
transform(c,b=grep("\\d",x,value = T))
a b
1 1 340.9
2 2 341
3 3 316.1
4 4 336.8
5 5 316.39
6 6 378.8
7 7 315
8 8 386.57
9 9 317.33

Related

Counting the frequency of differing patterns in a character string

I currently have a string in R that looks like this:
a <- "BMMBMMMMBMMMBMMBBMMM"
First, I need to determine the frequency of different patterns of "M" that appear in the string.
In this example it would be:
MM = 2
MMM = 2
MMMM = 1
Secondly, I then need to designate a numerical value/score for each different pattern.
i.e:
MM = 1
MMM = 2
MMMM = 3
This would mean that the total value/score of M's in a would equal 9.
If anyone knows any script that would allow me to do this for multiple strings like this in a dataframe that would be great?
Thank you.
a <- "BMMBMMMMBMMMBMMBBMMM"
tbl <- table(strsplit(a, "B"), exclude="")
tbl
# MM MMM MMMM
# 2 2 1
score <- sum(tbl * 1:3)
score
# 9
You could also use the table function.
a_list<-unlist(strsplit(a,"B"))
a_list<-a_list[!a_list==""] #remove cases when 2 B are together
a_list<-table(a_list)
# MM MMM MMMM
# 2 2 1
Here's a solution that uses the dplyr package. First, I load the library and define my string.
library(dplyr)
a <- "BMMBMMMMBMMMBMMBBMMM"
Next, I define a function that counts the occurrences of character x in string y.
# Tabulate the runs of character `x` found in string `y`.
#
# x: the single character whose runs are counted.
# y: the string to scan.
# Returns a data frame with one row per distinct run length: `String` is the
# run itself (e.g. "MMM") and `Freq` is how many runs of that length occur.
char_count <- function(x, y) {
  # Get runs of same character
  runs <- rle(strsplit(y, split = "")[[1]])
  # Count runs of character stored in `x`
  run_freq <- data.frame(table(runs$lengths[runs$values == x]))
  # Return strings and frequencies
  run_freq %>%
    # Var1 is a factor of run lengths. Convert via as.character() so strrep()
    # repeats by the length itself, not by the factor's internal level code.
    mutate(String = strrep(x, as.integer(as.character(Var1)))) %>%
    select(String, Freq)
}
Then, I run the function.
# Run the function
res <- char_count("M", a)
# String Freq
# 1 M 2
# 2 MM 2
# 3 MMM 1
Finally, I define my value vector and calculate the total value of vector a.
# My value vector
value_vec <- c(M = 1, MM = 2, MMM = 3)
# Total `value` of vector `a`
sum(value_vec * res$Freq)
#[1] 9
If it's acceptable to skip the first step, you could do:
nchar(gsub("(B+M)|(^M)","",a))
# [1] 9
First, compute all the different patterns that appear in your string:
a <- "BMMBMMMMBMMMBMMBBMMM"
chars <- unlist(strsplit(a, ""))
n_chars <- length(chars)
# For every substring length, take all start positions in one vectorised
# substring() call. seq_len() stays empty for empty input, unlike 1:0, and
# nothing is grown element by element.
pat <- unlist(lapply(seq_len(n_chars), function(len) {
  starts <- seq_len(n_chars - len + 1)
  substring(a, starts, starts + len - 1)
}))
# Keep each distinct substring once, in sorted order.
pat <- sort(unique(pat))
pat[1:5] : [1] "B" "BB" "BBM" "BBMM" "BBMMM"
Next, count the occurrences of each pattern:
counts = sapply(pat, function(w) length(gregexpr(w, a, fixed = TRUE)[[1]]))
Finally, build a data frame to summarize everything:
df = data.frame(counts = counts, num = 1:length(pat))
head(df, 10)
counts num
B 6 1
BB 1 2
BBM 1 3
BBMM 1 4
BBMMM 1 5
BM 5 6
BMM 5 7
BMMB 2 8
BMMBB 1 9
BMMBBM 1 10
library(stringr)
str_count(a, "MMMM")
gives 1
str_count(gsub("MMMM", "", a), "MMM") # now count how many times "MMM" occurs, but first delete the "MMMM"
gives 2
str_count(gsub("MMM", "", a), "MM") #now count how many times "MM" occurs, but first delete the "MMM"'s
gives 2

How can I get a character string in r to refer to a particular variable in a data table?

I have a data frame with variables labeled var1-var38 and I am trying to multiply var1*var20-38 and add these new variables to the existing data frame. My issue is that I can create a string with var20, but it is not referring to var20 in the data frame. My code is as follows:
inter.data <- read.csv("interactions.csv", header = T, sep = ",")
a <- 20
while(a <= 38) {
name <- paste("var", a, sep = "")
inter.data[paste("var1*", name, sep="")] <- NA
inter.data$var1*name <- (inter.data$var1)*(inter.data$name)
a <- a+1
}
I have tried
inter.data <- read.csv("interactions.csv", header = T, sep = ",")
a <- 20
attach(inter.data)
while(a <= 38) {
name <- paste("var", a, sep = "")
inter.data[paste("var1*", name, sep="")] <- NA
inter.data$var1*name <- (var1)*(name)
a <- a+1
}
as well
I don't think you need a for loop for this. You can add the columns you are trying to create as follows. In this example, we have a dataframe with four columns, and we want to add the column var1*var3 and var1* var4
# Toy data: var1 is the multiplier, var3 and var4 are the columns to scale.
df <- data.frame(
  var1 = c(2, 2, 2),
  var2 = c(1, 2, 3),
  var3 = c(4, 5, 6),
  var4 = c(7, 8, 9)
)
# Multiply columns 3 and 4 by var1 in one vectorised step.
new_df <- df[3:4] * df[["var1"]]
# Label each product column after the two columns it came from.
names(new_df) <- paste0("var1*", names(new_df))
cbind(df, new_df)
output:
var1 var2 var3 var4 var1*var3 var1*var4
1 2 1 4 7 8 14
2 2 2 5 8 10 16
3 2 3 6 9 12 18
Hope this helps!

Update column name unless it exists in other vector

I want to add something on the end of all column names in a dataframe, unless the column name exists in another given vector.
For example say I have
df <- data.frame('my' = c(1,2,3),
'data' = c(4,5,6),
'is' = c(7,8,9),
'here' = c(10,11,12))
dont_update <- c('my', 'is')
to_add <- '_new'
And I want to end up with
my data_new is here_new
1 1 4 7 10
2 2 5 8 11
3 3 6 9 12
A bit verbose, but this works
to_update <- names(df)[!names(df) %in% dont_update]
names(df)[match(to_update, names(df))] <- paste0(to_update, to_add)
or maybe this is clearer
names(df) <- ifelse(names(df) %in% dont_update, names(df), paste0(names(df), to_add))

Loop a sequence in R (standardize and winsorize dataframe)

I'm trying to loop this sequence of steps in r for a data frame.
Here is my data:
ID Height Weight
a 100 80
b 80 90
c na 70
d 120 na
....
Here is my code so far
# Pull the most extreme values of `x` in by one step: every occurrence of the
# minimum is replaced by the next-smallest value, and every occurrence of the
# maximum by the next-largest value.
#
# x: numeric vector.
# Returns a vector of the same length as `x`.
# If `x` contains NA, min()/max() are NA, nothing matches, and `x` comes back
# unchanged.
winsorize2 <- function(x) {
  # With fewer than two distinct non-missing values there is no "next" value
  # to fall back on; indexing past the end of the sorted values would
  # overwrite x with NA, so return it untouched.
  if (length(unique(x[!is.na(x)])) < 2L) {
    return(x)
  }
  Min <- which(x == min(x))
  Max <- which(x == max(x))
  ord <- order(x)
  # First sorted value after the block of minima.
  x[Min] <- x[ord][length(Min) + 1]
  # Last sorted value before the block of maxima. This is read after the
  # minima were replaced, so the order of these two statements matters.
  x[Max] <- x[ord][length(x) - length(Max)]
  x
}
df<-read.csv("data.csv")
df2 <- scale(df[,-1], center = TRUE, scale = TRUE)
id<-df$Type
full<-data.frame(id,df2)
full[is.na(full)] <- 0
full[, -1] <- sapply(full[,-1], winsorize2)
what i'm trying to do is this:-> Standardize a dataframe, then winsorize the standardized dataframe using the function winsorize2, ie replace the most extreme values with the second least extreme value. This is then repeated 10 times. How do i do a loop for this? Im confused as in the sequence ive already replaced the nas with 0s and so i should remove this step from the loop too?
Edit: After discussion with #ekstroem, we decided to change the code to introduce the boundaries:
df<-read.csv("data.csv")
id<-df$Type
df2<- scale(df[,-1], center = TRUE, scale = TRUE)
df2[is.na(df2)] <- 0
df2[df2<=-3] = -3
df2[df2>=3] = 3
df3<-df2 #trying to loop again
df3<- scale(df3, center = TRUE, scale = TRUE)
df3[is.na(df3)] <- 0
df3[df3<=-3] = -3
df3[df3>=3] = 3
There are some boundary issues that are not fully specified in your code, but maybe the following can be used (using base R and not super efficient)
# Clamp the n smallest and n largest distinct values of `x`.
#
# x: vector to winsorize.
# n: number of distinct values to pull in at each end (default 1).
# Values at or below the n-th smallest distinct value become the (n+1)-th
# smallest; values at or above the (n+1)-th largest become that value.
wins2 <- function(x, n = 1) {
  distinct_sorted <- sort(unique(x))
  upper_pos <- length(distinct_sorted) - n
  # Lower tail first; the upper-tail test then runs on the updated x.
  x[x <= distinct_sorted[n]] <- distinct_sorted[n + 1]
  x[x >= distinct_sorted[upper_pos]] <- distinct_sorted[upper_pos]
  x
}
This yields:
x <- 1:11
# The function defined above is wins2().
wins2(x, 1)
# [1] 2 2 3 4 5 6 7 8 9 10 10
wins2(x, 3)
# [1] 4 4 4 4 5 6 7 8 8 8 8

Separate text to variables in R

I have in one column of the table this:
paragemcard-resp+insufcardioresp
dpco+pneumonia
posopperfulceragastrica+ards
pos op hematoma #rim direito expontanea
miopatiaduchenne-erb+insuf.resp
dpco+dhca+#femur
posde#subtroncantéricaesqª+complicepidural
dpco+asma
And i want to separate them like this:
paragemcard-resp insufcardioresp
dpco pneumonia
posopperfulceragastrica ards
pos op hematoma #rim direito expontanea
miopatiaduchenne-erb insuf.resp
dpco dhca #femur
posde#subtroncantéricaesqª complicepidural
dpco asma
But the problem is that they don't have the same length.
As you can see, in line 3 we have 2 variables and in line 6 we have 3.
And i want to create this string in the same column for further analysis.
Thanks
You can use read.table, but you should use count.fields or some kind of regex to figure out the correct number of columns first. Using Robert's "text" sample data:
Cols <- max(sapply(gregexpr("+", text, fixed = TRUE), length))+1
## Cols <- max(count.fields(textConnection(text), sep = "+"))
read.table(text = text, comment.char="", header = FALSE,
col.names=paste0("V", sequence(Cols)),
fill = TRUE, sep = "+")
# V1 V2 V3
# 1 paragemcard-resp insufcardioresp
# 2 dpco pneumonia
# 3 posopperfulceragastrica ards
# 4 pos op hematoma #rim direito expontanea
# 5 miopatiaduchenne-erb insuf.resp
# 6 dpco dhca #femur
# 7 posde#subtroncantГ©ricaesqВЄ complicepidural
# 8 dpco asma
Also, possibly useful: the "stringi" library makes counting elements easy (as an alternative to the gregexpr step above).
library(stringi)
Cols <- max(stri_count_fixed(x, "+") + 1)
Why the need for the "Cols" step? read.table and family decide how many columns to use either by (1) the maximum number of fields detected within the first 5 rows of data or (2) the length of the col.names argument. In your example, the row with the most fields is the sixth row, so directly using read.csv or read.table would result in incorrectly wrapped data.
You can use strsplit:
# Sample input: one entry per row, fields separated by a literal "+".
text <- c(
  "paragemcard-resp+insufcardioresp",
  "dpco+pneumonia",
  "posopperfulceragastrica+ards",
  "pos op hematoma #rim direito expontanea",
  "miopatiaduchenne-erb+insuf.resp",
  "dpco+dhca+#femur",
  "posde#subtroncantГ©ricaesqВЄ+complicepidural",
  "dpco+asma"
)
# Split on the literal "+" (fixed = TRUE, so it is not treated as a regex).
strings <- strsplit(text, "+", fixed = TRUE)
# Widest row decides the number of columns.
maxlen <- max(lengths(strings))
# Pad every shorter row with NA up to that width.
strings <- lapply(strings, `length<-`, maxlen)
# Lay the padded rows out row by row, one column per field.
strings <- data.frame(matrix(unlist(strings), ncol = maxlen, byrow = TRUE))
and it looks like
X1 X2 X3
1 paragemcard-resp insufcardioresp <NA>
2 dpco pneumonia <NA>
3 posopperfulceragastrica ards <NA>
4 pos op hematoma #rim direito expontanea <NA> <NA>
5 miopatiaduchenne-erb insuf.resp <NA>
6 dpco dhca #femur
7 posde#subtroncantГ©ricaesqВЄ complicepidural <NA>
8 dpco asma <NA>

Resources