convert numbers in to letters in the data frame - r

How to convert the numbers of my last column into a letters in which 1=A ,2=C , 3=G , 4=T , 5=^ , 6=!
please I need help,
```
"TCCA^TA!A"
"TT^CAATTAA!C"
test <- list(c("T"="4", "C"="2", "C"="2", "A"="1", "^"="5", "T"="4", "A"="1","!"="6","A"="1"),c("T"="4","T"="4","^"="5","C"="2","A"="1","A"="1","T"="4","T"="4","A"="1","A"="1","!"="6","C"="2"))
oddfunction <- function(test) {
first_column<- test
second_column<- sort(first_column)
third_column<-paste(first_column,second_column)
fourth_column<-sort(third_column)
column_5<-paste(first_column,fourth_column)
column_6<-sort(column_5)
column_7<-paste(first_column,column_6)
column_8<-sort(column_7)
column_9<-paste(first_column,column_8)
column_10<-sort(column_9)
column_11<-paste(first_column,column_10)
df<-data.frame(first_column, second_column, third_column, fourth_column,column_5,column_6,column_7,column_8,column_9,column_10,column_11)
print(df)
}
for (i in test) {
oddfunction(i)
```

Use chartr:
chartr("123456", "ACGT^!", "1232126565")
#[1] "ACGCAC!^!^"

Related

Subsetting a data set and plotting means

I have a data set including Year, Site, and Species Count. I am trying to write a code that reflects in some years, the counts were done twice. For those years I have to find the mean count at each site for each species (there are two different species), and plot those means. This is the code I have generated:
DataSet1 <- subset(channel_islands,
channel_islands$SpeciesName=="Hypsypops ubicundus, adult" |
channel_islands$SpeciesName=="Paralabrax clathratus,adult")
years<-unique(DataSet1$Year)
Hypsypops_mean <- NULL
Paralabrax_mean <- NULL
Mean <- NULL
years <- unique(DataSet1$Year)
for(i in 1:length(years)){
data_year <- DataSet1[which(DataSet1$Year == years[i]), ]
Hypsypops<-data_year[which(data_year$SpeciesName=="Hypsypops rubicundus,adult"), ]
Paralabrax<-data_year[which(data_year$SpeciesName=="Paralabrax clathratus,adult"), ]
UNIQUESITE<-unique(unique(data_year$Site))
for(m in 1:(length(UNIQUESITE))){
zz<-Hypsypops[Hypsypops$Site==m,]
if(length(zz$Site)>=2){
Meanp <- mean(Hypsypops$Count[Hypsypops$Site==UNIQUESITE[m]])
Hypsypops_mean <- rbind(Hypsypops_mean,
c(UNIQUESITE[m], years[i], round(Meanp,2),
'Hypsypops rubicundus,adult'))
}
kk <- Paralabrax[Paralabrax$Site==m, ]
if(length(kk$Site)>=2){
Meane <- mean(Paralabrax$Count[Paralabrax$Site==UNIQUESITE[m]])
Paralabrax_mean <- rbind(Paralabrax_mean,
c(UNIQUESITE[m], years[i], round(Meane, 2),
'Paralabrax clathratus,adult'))
}
}
if(i==1){
Mean<-rbind(Hypsypops_mean, Paralabrax_mean)
}
if(i>1){
Mean<-rbind(DataMean, Hypsypops_mean, Paralabrax_mean)
}
Hypsypops_mean<-NULL
Paralabrax_mean<-NULL
}
Mean <- as.data.frame(Mean,stringsAsFactors=F)
names(Mean) <- c('Site','Year','mean_count','SpeciesName')
Mean$Site <- as.integer(Mean$Site)
Mean$Year <- as.integer(Mean$Year)
Mean$mean_count <- as.numeric(Mean$mean_count)
par(mfrow=c(5,5), oma=c(4,2,4,2), mar=c(5.5,4,3,0))
for(i in 1:length(years)){
if(any(Mean$Year==years[i])) {
year1<-Mean[which(Mean$Year==years[i]),]
Species<-unique(as.character(year1$SpeciesName))
Colors<-c("pink","purple")[Species]
Data_Hr<-year1[year1$SpeciesName=="Hypsypops rubicundus,adult",]
Data_Pc<-year1[year1$SpeciesName=="Paralabrax clathratus,adult",]
plot(Data_Hr$mean_count~Data_Pc$mean_count,
xlab=c("Hypsypops rubicundus"),
ylab=c("Paralabrax clathratus"),main=years[i],pch=16)
}
}
It's a lot I'm sorry, I'm not sure of a way to streamline the process. But I keep getting an error:
Error in names(Mean) <- c("Site", "Year", "mean_count", "SpeciesName")
: 'names' attribute [4] must be the same length as the vector [0]
Not sure how I can debug this.
Not sure why you want to do this with an elaborate loop code. It sounds like you are trying to summarise your data.
This can be done in different ways. Here is a solution using dplyr:
DataSet1 %>%
group_by(Year, SpeciesName, Site) %>%
summarise(nrecords = n(),
Count = mean(Count))
To get a better answer, it might be helpful to post a subset of the data and the intended result you are after.

How to concatenate strings with different separator every n-th element

I would like to concatenate words (strings) with different separator every 10-th element, such that each word is separated by a comma until every 10th word then it's separated by a comma and a line break. The ultimate purpose is for printing neatly a list of words into a table.
I can write a loop but I am hoping for a more elegant solution as proposed in these related questions using gsub and regular expressions:
here and here that involves inserting/replacing string after every n-th character but in my case my words have variable length (of characters).
Edit: I am looking for solution I can apply to any vector with variable number of words.
For reproducible data, I generate a vector of 40 random words using code from this source
MHmakeRandomString <- function(n, length) {
randomString <- c(1:n)
for (i in 1:n) {
randomString[i] <- paste(sample(c(0:9, letters, LETTERS), length, replace=TRUE),
collapse="")}
return(randomString)
}
set.seed(4)
word_vector <- MHmakeRandomString(n=40, length=5)
word_vector
# [1] "A0ihO" "gIUW4" "Kh6Xp" "sYAXL" "IZvuE" "PtQvw" "zeSEt" "YsCo0" "WfzbU" "5TTIz"
# [11] "oKTOO" "qaaTK" "y4QUd" "C4vNY" "lDplP" "Gjrg8" "UHzUT" "32ZcV" "c7xgl" "5Lr2H"
# [21] "fDgxt" "zFdYO" "hohuK" "vrNU4" "8oRg5" "IYcyl" "pblbO" "SHhq0" "yFjWa" "rzYLr"
# [31] "m2AXf" "QdhtM" "TWpkh" "4499K" "5Bcv8" "0DeqI" "6BdTy" "fJgKX" "tUZeh" "HPso5"
I usually do a paste(x, collapse) and then print to table using gridExtra
word_sep <- paste(word_vector, collapse=", ")
# [1] "z6LHb, 1ubB1, o9TZ2, 8s8bV, sZmcB, blirI, gMfo1, xXkkt, gFMrA, hXdaO,
# lNP2Q, p9B9G, JXTsJ, qVsWS, ntiT8, d0QRv, uoR1D, L99Bg, THWQo, meuev,
# IO0Au, 0yWmh, 72d3g, FJRDS, PtbJT, JaXVK, OPo9m, i0678, 6BpXZ, b6hzT,
# BDQBk, ANC5h, 7QPgM, JJSxf, nnX7Z, rbEfm, XXl4Z, kHMuI, wFLyM, P8rlp"
library(gridExtra)
plot_grid(tableGrob(word_sep))
Current table output: In this case I have a really long list of words and specified table width so I need line breaks.
My desired output would look like this hacked version:
word_sep2 <- paste(c(paste(MHmakeRandomString(n=10, length=5), collapse=", "), ",\n",
paste(MHmakeRandomString(n=10, length=5), collapse=", "), ",\n",
paste(MHmakeRandomString(n=10, length=5), collapse=", "), ",\n",
paste(MHmakeRandomString(n=10, length=5), collapse=", ")), collapse="")
word_sep2
# [1] "0ahiL, 2pA5c, dKWuR, 79sw5, MeL1I, KpB1w, UNLSo, LlDlN, jNOcI, tv8R5,
# \norf60, avKFo, jZFxE, U7RQW, SSmxD, czlMt, 75zEB, 2jLwG, 08dmN, H3sVW,
# \nCZwQt, ggumo, wHUpj, Z7WGR, BHYLE, eWksX, Lbt3D, P1Brf, OpEvk, 1WFVa,
# \nEeFd4, afX7B, nyBzF, vbNLz, U7MU0, H4rx4, AKgv8, Kbzri, KKajp, Yg6EW"
plot_grid(tableGrob(word_sep2))
Desired table output:
You may use
gsub("((?:[^,]*,){10}) ", "\\1\n", word_sep)
See the online regex demo.
Details
((?:[^,]*,){10}) - Group 1 (referred to with \1 from the replacement pattern) that matches 10 consecutive occurrences of
[^,]* - any 0+ chars other than ,
, - a comma
- a space
See the R demo:
MHmakeRandomString <- function(n, length) {
randomString <- c(1:n)
for (i in 1:n) {
randomString[i] <- paste(sample(c(0:9, letters, LETTERS), length, replace=TRUE),
collapse="")}
return(randomString)
}
set.seed(4)
word_vector <- MHmakeRandomString(n=40, length=5)
word_sep <- paste(word_vector, collapse=", ")
f <- gsub("((?:[^,]*,){10}) ", "\\1\n", word_sep)
cat(f, collapse="\n")
I gues you can do it with paste
paste(word_vector, rep(c(", ", ",\n"), c(9,1)), collapse = "", sep = "")
[1] "A0ihO, gIUW4, Kh6Xp, sYAXL, IZvuE, PtQvw, zeSEt, YsCo0, WfzbU, 5TTIz,\noKTOO, qaaTK, y4QUd, C4vNY, lDplP, Gjrg8, UHzUT, 32ZcV, c7xgl, 5Lr2H,\nfDgxt, zFdYO, hohuK, vrNU4, 8oRg5, IYcyl, pblbO, SHhq0, yFjWa, rzYLr,\nm2AXf, QdhtM, TWpkh, 4499K, 5Bcv8, 0DeqI, 6BdTy, fJgKX, tUZeh, HPso5,\n"
Here's what it looks like when printing it with cat:
res <- paste(word_vector, rep(c(", ", ",\n"), c(9,1)), collapse = "", sep = "")
cat(res)
# A0ihO, gIUW4, Kh6Xp, sYAXL, IZvuE, PtQvw, zeSEt, YsCo0, WfzbU, 5TTIz,
# oKTOO, qaaTK, y4QUd, C4vNY, lDplP, Gjrg8, UHzUT, 32ZcV, c7xgl, 5Lr2H,
# fDgxt, zFdYO, hohuK, vrNU4, 8oRg5, IYcyl, pblbO, SHhq0, yFjWa, rzYLr,
# m2AXf, QdhtM, TWpkh, 4499K, 5Bcv8, 0DeqI, 6BdTy, fJgKX, tUZeh, HPso5,

Change a column value based on nchar

I have the following issue: Consider a dataframe of equal length with the columns title and subtitle. title is rather clean data and subtitle is rather messy (wrong values, NAs, ...) However, when subtitle is filled in correctly, it contains a lot more information than my title variable.
I would like to replace the values in my title column where the nchar of a certain subtitle observation exceeds the nchar of the title observation.
at the moment my failed code looks like this:
#filter from real table
baseTable_sentiment <- filter(baseTable, theme_ecoFin == 1)
#with this code I try to do what I explained, while coping with the NA's in subtitle
baseTable_sentiment$title <- baseTable_sentiment$subtitle[nchar(baseTable_sentiment$subtitle , allowNA = TRUE , keepNA = TRUE) > nchar(baseTable_sentiment$title) , ]
an alternative approach to cope with the NAs
#filter from real table
baseTable_sentiment <- filter(baseTable, theme_ecoFin == 1)
#change NA to text value "na"
baseTable_sentiment$subtitle <- replace(baseTable_sentiment$subtitle,which(is.na(baseTable_sentiment$subtitle)),"na")
#same code as before
baseTable_sentiment$title <- baseTable_sentiment$subtitle[nchar(baseTable_sentiment$subtitle ) > nchar(baseTable_sentiment$title) , ]
Now when I run one of the two examples: I get the following error:
Error in
baseTable_sentiment$subtitle[(nchar(baseTable_sentiment$subtitle, :
incorrect number of dimensions
However: when i check all used dimensions
> > > length(baseTable_sentiment$subtitle) [1] 170206
> > > length(baseTable_sentiment$title) [1] 170206
> > > length(nchar(baseTable_sentiment$subtitle , allowNA = TRUE ) > nchar(baseTable_sentiment$title)) [1] 170206
How can I fix this, or do you guys have an alternative way to do this operation?
The following link contains a data example
Thank you in advance
Olivier
I found the error: the dimension of my title does not correspond anymore with the new dimension created by the condition after the assignment.
baseTable_sentiment$title[nchar(baseTable_sentiment$subtitle , allowNA = TRUE , keepNA = FALSE) > nchar(baseTable_sentiment$title) ]<- baseTable_sentiment$subtitle[nchar(baseTable_sentiment$subtitle , allowNA = TRUE , keepNA = FALSE) > nchar(baseTable_sentiment$title) ]
The dimensions are now matched and the code runs perfectly

Avoiding for loop, Naming Example

I would like to avoid using for loop in following example. Goal is to repeat string vector multiple times with different second part which changes each repetition. Is that possible?
str2D = mtcars
Vector = c(10,20)
Dimen = dim( str2D )
nn = c()
for ( i in Dimen[2]*(1:length(Vector)) ){
nn[ (i+1-Dimen[2]): i ] = rep(paste("|d",Vector[i/Dimen[2]],sep=""), Dimen[2] )
}
Name = paste( rep(names(str2D) , length(Vector) ),nn,sep="")
Correct result for "Name" vector is following:
"mpg|d10" "cyl|d10" "disp|d10" "hp|d10" "drat|d10" "wt|d10" "qsec|d10" "vs|d10" "am|d10" "gear|d10" "carb|d10" "mpg|d20" "cyl|d20" "disp|d20" "hp|d20" "drat|d20" "wt|d20" "qsec|d20" "vs|d20" "am|d20" "gear|d20" "carb|d20"
Thank you
I don't quite understand the end goal here but at least this achieves your desired output without a loop:
Name <- paste0(paste(names(mtcars)), "|d", rep(1:2, each = length(names(mtcars))), "0")
> Name
[1] "mpg|d10" "cyl|d10" "disp|d10" "hp|d10" "drat|d10" "wt|d10" "qsec|d10"
[8] "vs|d10" "am|d10" "gear|d10" "carb|d10" "mpg|d20" "cyl|d20" "disp|d20"
[15] "hp|d20" "drat|d20" "wt|d20" "qsec|d20" "vs|d20" "am|d20" "gear|d20"
[22] "carb|d20"

R loop for creating and using -csv

I have a function output (from koRpus) of the form:
Total number of tokens: 887
Total number of types: 393
Measure of Textual Lexical Diversity
MTLD: 142.66
Number of factors: 6.22
Factor size: 0.72
SD tokens/factor: 41.55 (all factors)
38 (complete factors only)
And I want to make a loop for storing these results for 80 different documents. I have tried the following:
for (i in 1:length(infra$tableid)) {
whypar <- paste(infra$whypar [infra[,1] ==i], collapse=" ")
wpi<- removeWords(whypar, stopwords("english"))
as.data.frame(wpi)
write.csv(data.frame(wpi), file= "wp.csv")
tagged.text <- tokenize("wp.csv", lang="en")
res.mtld <- MTLD(tagged.text)
write.csv(data.frame(res.mtld),file="output.csv")
}
where infra is:
tableid 1, 2, 3, ... 80
whypar "I took part because xxx", "I believe that jshfdjk", "jhsadkjhd" ... (N=350)
Thanks for any help
Extract the parts of the MTLD object you are interested in first. From your question it seems like you are only interested in a subset of the object returned by MTLD, namely the MTLD score, number of factors the SD of tokens/factor and the SD for complete factors only. If you only want these results for each file you can just write one nice table as your output for all the files:
res <- data.frame( ID = numeric() , MTLD=numeric() , Factor_Size=numeric() , SD=numeric() , SD_Complete=numeric() )
for (i in 1:length(infra$tableid)) {
whypar <- paste(infra$whypar [infra[,1] ==i], collapse=" ")
wpi<- removeWords(whypar, stopwords("english"))
wpi <- as.data.frame(wpi)
write.csv(data.frame(wpi), file= "wp.csv")
tagged.text <- tokenize("wp.csv", lang="en")
res.mtld <- MTLD(tagged.text)
mtld <- res.mtld#MTLD$MTLD
fac.size <- res.mtld#param$factor.size
mtld.sd <- res.mtld#MTLD$lengths$sd
mtld.sd.compl <- res.mtld#MTLD$lengths$sd.compl
res <- rbind( res , c( infra$tableid[i] , mtld, fac.size , mtld.sd , mtld.sd.compl ) )
}
write.csv( res , file="output.csv" )
I hope this helps, but check these are the results you want returned.

Resources