Ranking changes according to their frequency - r

I am working on a school project where I need to create many graphs based on data from an Excel table and this is one of them
D is a column in Excel
library(ggmosaic)
library(readxl)
library(dplyr)
library(ggplot2)
library(ggpubr)
######################################## - A - #######################################################
getwd()
setwd("C:/ZS (projekt)") # V případě potřeby upravte odkaz na vlastní pracovní adresář
data_ex = read_excel("C:/Users/zukov/OneDrive/Desktop/Pocitacove_hry.xlsx")
# Zjednodušme pojmenování proměnných.
colnames(data_ex)=c("ID","nazev","rok_vydaju","zanr","publisher","plat","pocitac_studium","soc_site",
"os","leg_software","ikb_znalost")
data_ex$zanr = as.factor(data_ex$zanr)
levels(data_ex$zanr)
data_ex$zanr = factor(data_ex$zanr,
levels = c("Sports/Racing","Simulation","Strategy","Role-Playing","Action","Shooter"),
labels = c("Sportovní/Závodní","Simulace","Strategie","RPG","Akční","Střílečka"))
# Určeme četnosti výskytu jednotlivých variant a uložme je pod názvem cetnosti_nom.
cetnosti_nom = table(data_ex$zanr)
cetnosti_nom # výpis absolutních četností
# V případě nominální proměnné je vhodné varianty proměnné seřadit dle jejich četností.
names(sort(table(dotaznik2$zanr),decreasing = D))
data_ex$zanr = factor(data_ex$zanr,
levels = names(sort(table(data_ex$zanr),
decreasing = TRUE)),
order = D)
cetnosti_nom = table(data_ex$zanr)
cetnosti_nom # Nyní jsou výstupem četnosti variant analyzované proměnné v požadovaném pořadí.

Related

How to create multiple loops inside a function in r

I want to obtain new values for each step and use them inside an equation for further steps. Normally, I can perform loop but for this problem, I need to use past values, too. For instance, I have flow data like this:
q<-c(10, 15.83333, 21.66667)
I created a loop manually:
z1<-190 #initial elevation
s1<-24011 #initial storage
in1<-q[1] #initial inflow
out1<-1.86*sqrt((z1-110)*19.62) #outflow
z2<- z1+0.3*((in1-out1)/s1) #elevation at second step
in2<-q[2] #second inflow
out2<-1.86*sqrt((z2-110)*19.62) #outflow at z2 elevation
ds2<-0.3*((in1+in2)/2-(out1+out2)/2) #change in storage
s2<-s1+ds2 #net storage value
z3<-z2+0.3*((in2-out2)/s2) #elevation at third step
in3<-q[3]
out3<-1.86*sqrt((z3-110)*19.62)
ds3<-0.3*((in2+in3)/2-(out2+out3)/2)
s3<-s2+ds3
.
.
.
z4<-z3+0.3*((in3-out3)/s3)
Briefly, I am calculating z value using previous values of in,out,s. What I need to find is z values considering q values.
Expected result is:
z q outflows storages
[1,] 190.0000 10.00000 73.68981 24011.00
[2,] 189.9992 15.83333 73.68944 23992.77
[3,] 189.9985 21.66667 73.68911 23976.29
I'll extend my comment here.
You can overwrite the variables. Following your implementation you could for instance create a temporal variable for your out2:
q = c(10, 15.83333, 21.66667)
#Results storage array
results = array(numeric(),c(length(q),4))
colnames(results) = c("z", "q", "outflows", "storages")
z = 190
s = 24011
infl = q[1]
out = 1.86*sqrt((z-110)*19.62)
#Save init values
results[1,1] = z
results[,2] = q
results[1,3] = out
results[1,4] = s
for (n in 2:length(q)) {
z = z+0.3*((infl-out)/s)
out_tmp = 1.86*sqrt((z-110)*19.62)
ds = 0.3*((infl+q[n])/2-(out+out_tmp)/2)
s = s+ds
infl = q[n]
out = out_tmp
results[n,1] = z
results[n,3] = out
results[n,4] = s
}
View(results)
If you want to avoid to create the temporal variable, you can try something like this:
q = c(10, 15.83333, 21.66667)
results = array(numeric(),c(length(q),4))
colnames(results) = c("z", "q", "outflows", "storages")
z = 190
s = 24011
infl = q[1]
out = 1.86*sqrt((z-110)*19.62)
#Save init values
results[1,1] = z
results[,2] = q
results[1,3] = out
results[1,4] = s
for (n in 2:length(q)) {
z = z+0.3*((infl-out)/s)
out = 1.86*sqrt((z-110)*19.62)
ds = 0.3*((infl+q[n])/2-(results[n-1,3]+out)/2)
s = s+ds
infl = q[n]
results[n,1] = z
results[n,3] = out
results[n,4] = s
}
View(results)
Init values and create a result table
Add the current state values to the table
Simulate new state using old or new states
Set the new state to all variables
library(tidyverse)
data <- tibble(step = numeric(), out = numeric(), y = numeric(), z = numeric())
# Initialization
z <- 190
y <- 1
out <- NA
for (step in seq(5)) {
# save current state
data <- data %>% add_row(step = step, out = out, z = z, y = y)
# use old state of z
new_out <- z / 2
# use old state of y
new_z <- y + 1
# use new state of out
new_y <- new_out
# Lastly, update all new variables
out <- new_out
y <- new_y
z <- new_z
}
data
#> # A tibble: 5 x 4
#> step out y z
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 NA 1 190
#> 2 2 95 95 2
#> 3 3 1 1 96
#> 4 4 48 48 2
#> 5 5 1 1 49
Created on 2021-11-10 by the reprex package (v2.0.1)

how to get total variance explained by each principal component

I have created a PCA plot using:
library(SNPRelate)
library(gdsfmt)
vcf.fn <- "input.vcf"
snpgdsVCF2GDS(vcf.fn, "test.gds", method="biallelic.only")
snpgdsSummary("test.gds")
genofile <- snpgdsOpen("test.gds")
pop_code <- read.gdsn(index.gdsn(genofile, "genotype"))
snpset <- snpgdsLDpruning(genofile, autosome.only=FALSE, ld.threshold=0.2, maf= 0.01, missing.rate=0.5)
snpset.id <- unlist(snpset)
pca <- snpgdsPCA(genofile, autosome.only=FALSE, snp.id=snpset.id, num.thread=2)
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))
tab <- data.frame(sample.id = pca$sample.id,
EV1 = pca$eigenvect[,1], # the first eigenvector
EV2 = pca$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE)
plot(tab$EV2, tab$EV1, xlab="eigenvector 2", ylab="eigenvector 1")
PCA plot looks like:
[1]: https://i.stack.imgur.com/WBeKT.png
I have created a matrix representing sample names (in rows) and first five PC's (in columns):
sample.id EV1 EV2 EV3 EV4 EV5
1 T11 -0.007433146 -0.038371106 0.079585181 0.069839389 0.12178713
2 T3 -0.014198086 0.069641911 0.006414285 -0.004750456 0.046201258
3 T10 -0.086656303 0.026455731 -0.028758639 -0.015004286 -0.007497732
4 T162 -0.00520634 0.053996842 0.021754194 -0.004660844 0.006939661
5 T163 -0.020055447 0.027697494 -0.006933852 -0.058596466 0.028236645
I want to check how much variation is explained by each PC component. Thank you for your help!

Prediction with lda in R : Warning message: 'newdata' had 1600 rows but variables found have 200 rows

I am new to R. I am trying to use lda to classify all points in a generated grid. The training set is two point groups randomly generated using rmvnorm(n,mean,sigma). Here is my code :`
# number of samples
n=100;
# parameters: G2
meanG1 = matrix(
c(2, 2), # the data elements
nrow=1, # number of rows
ncol=2, # number of columns
byrow = TRUE) # fill matrix by rows
sigmaG1 = matrix(
c(1,0,0,1), # the data elements
nrow=2, # number of rows
ncol=2, # number of columns
byrow = TRUE) # fill matrix by rows
library(mvtnorm)
# Generating a matrix G1 with norm distribution
G1 = rmvnorm(n, meanG1, sigmaG1)
G1[,3]=1
# parameters: G2
meanG2 = matrix(
c(0, 0), # the data elements
nrow=1, # number of rows
ncol=2, # number of columns
byrow = TRUE) # fill matrix by rows
sigmaG2 = matrix(
c(1,0.75,0.75,1), # the data elements
nrow=2, # number of rows
ncol=2, # number of columns
byrow = TRUE) # fill matrix by rows
# # Generating a matrix G2 with norm distribution
G2 = rmvnorm(n, meanG2, sigmaG2)
# adding a column as a label = 1 to G1 matrix
G1 = cbind(G1, 1 )
# adding a column as a label = 2 to G2 matrix
G2 = cbind(G2, 2 )
# Concatenate both matrices
G = rbind(G1,G2)
# Transforming Matrix into dataFrame
bothGroupsWithLabel <- as.data.frame(G)
# Shuffling data row-wise
bothGroupsWithLabel <- bothGroupsWithLabel[sample(nrow(bothGroupsWithLabel)),]
# plotting the generated matrices
plot(c(G1[,1]),c(G1[,2]),col="red")
points(c(G2[,1]),c(G2[,2]),col="blue")
# Generating a grid
K = 40;
seqx1 = seq(min(G1[,1]),max(G1[,1]),length = K)
seqx2 = seq(min(G1[,2]),max(G1[,2]),length = K)
myGrid = expand.grid(z1=seqx1,z2=seqx2);
plot(myGrid[,1],myGrid[,2])
library(MASS)
# Creating a model
model.lda = lda(bothGroupsWithLabel[,3] ~bothGroupsWithLabel[,1]+bothGroupsWithLabel[,2] , data = bothGroupsWithLabel);
Ypred = predict(model.lda, newdata=myGrid);
Ypredgrid = Ypred$class
Here is a part of my data bothGroupsWithLabel
V1 V2 V3
69 2.0683949 0.5779272 1
53 2.1261046 2.0420350 1
118 -1.4502033 -1.4775360 2
148 1.1705251 1.5437296 2
195 0.3100763 -0.2594026 2
40 1.8573633 3.7717020 1
and
myGrid
z1 z2
1 0.1048024 -0.2034172
2 0.2227540 -0.2034172
3 0.3407055 -0.2034172
4 0.4586571 -0.2034172
5 0.5766086 -0.2034172
6 0.6945602 -0.2034172
my grid consists of 40*40 points, hence the size of myGird data frame is 1600 rows and 2 columns. The data frame bothGroupsWithLabel consists of 200 rows and 3 columns, the first two columns are the coordinates of the points and the third column is used for labels. My problem is when I call predict(model.lda, newdata=myGrid) I get this warning message:
Warning message:
'newdata' had 1600 rows but variables found have 200 rows
what am i missing here? can anyone please help me?
The problem is the way that you generated your model. When using a formula and data=... it is better to just use the variable names. In order for this to work, you must also make the variable names match in newdata. So, when you create myGrid add the line:
names(myGrid) = c("V1", "V2")
and then make your last few lines be:
model.lda = lda(V3 ~ V1 + V2 , data = bothGroupsWithLabel);
Ypred = predict(model.lda, newdata=myGrid);
Ypredgrid = Ypred$class
That should get what you want.

specify colorRamp endpoints

Can I specify endpoints to colorRamp so that a value maps consistently to a single color, regardless of the range of other data?
I'm trying to create an interactive correlation plot in plotly. Here's some sample data.
set.seed(1)
m <- 4
cm <- matrix(runif(m**2,-1,1),
nrow=m, ncol=m,
dimnames=list(letters[1:m],letters[1:m]))
diag(cm) <- 1
cm
# a b c d
# a 1.0000000 -0.5966361 0.2582281 0.3740457
# b -0.2557522 1.0000000 -0.8764275 -0.2317926
# c 0.1457067 0.8893505 1.0000000 0.5396828
# d 0.8164156 0.3215956 -0.6468865 1.0000000
I'm basically trying to create an interactive version of this:
library(corrplot)
corrplot(cm,method='shade')
Here's the (kind of hacky) interactive correlation plot I created.
div_colors <- c('dark red','white','navy blue')
grid_labels <- matrix(paste0('Cor(',
do.call(paste,c(expand.grid(rownames(cm),colnames(cm)), sep=', ') ),
'): ',
t(round(cm,2))
),
ncol=m,byrow=TRUE)
library(plotly)
plot_ly(x = colnames(cm),
y = rownames(cm),
z = cm,
colors = colorRamp(div_colors),
type='heatmap',
hoverinfo='text',
text = grid_labels
) %>% layout(yaxis=list(autorange='reversed'))
My problem is that without forcing the colorRamp endpoints to c(-1,1), the white color doesn't match correlation of 0, and the dark red maps to the minimum observed, rather than -1.
As #rawr mentioned in a comment, the solution is to set zmin and zmax, as in:
plot_ly(x = colnames(cm),
y = rownames(cm),
z = cm,
zmin=-1, # <============
zmax=1, # <============
colors = colorRamp(div_colors),
type='heatmap',
hoverinfo='text',
text = grid_labels
) %>% layout(yaxis=list(autorange='reversed'))
Which produces the desired result. (The legend bar is shorter, presumably due to a change in default sizes in a newer version of plotly.)

Overlap plots in R - from zoo package

Using the following code:
library("ggplot2")
require(zoo)
args <- commandArgs(TRUE)
input <- read.csv(args[1], header=F, col.names=c("POS","ATT"))
id <- args[2]
prot_len <- nrow(input)
manual <- prot_len/100 # 4.3
att_name <- "Entropy"
att_zoo <- zoo(input$ATT)
att_avg <- rollapply(att_zoo, width = manual, by = manual, FUN = mean, align = "left")
autoplot(att_avg, col="att1") + labs(x = "Positions", y = att_name, title="")
With data:
> str(input)
'data.frame': 431 obs. of 2 variables:
$ POS: int 1 2 3 4 5 6 7 8 9 10 ...
$ ATT: num 0.652 0.733 0.815 1.079 0.885 ...
I do:
I would like to upload input2 which has different lenght (therefore, different x-axis) and overlap the 2 curves in the same plot (I mean overlap because I want the two curves in the same plot size, so I will "ignore" the overlapped axis labels and tittles), I would like to compare the shape, regardles the lenght of input.
First I've tried by generating toy input2 changing manual value, so that I have att_avg2 in which manual equals e.g. 7. In between original autoplot and new autoplot-2 I add par(new=TRUE), but this is not my expected output. Any hint on how doing this? Maybe it's better to save att_avg from zoo series to data.frame and not use autoplot? Thanks
UPDATE, response to G. Grothendieck:
If I do:
[...]
att_zoo <- zoo(input$ATT)
att_avg <- rollapply(att_zoo, width = manual, by = manual, FUN = mean, align = "left") #manual=4.3
att_avg2 <- rollapply(att_zoo, width = 7, by = 7, FUN = mean, align = "left")
autoplot(cbind(att_avg, att_avg2), facet=NULL) +
labs(x = "Positions", y = att_name, title="")
I get
and a warning message:
Removed 1 rows containing missing values (geom_path).
par is used with classic graphics, not for ggplot2. If you have two zoo series just cbind or merge the series together and autoplot them using facet=NULL:
library(zoo)
library(ggplot2)
z1 <- zoo(1:3) # length 3
z2 <- zoo(5:1) # length 5
autoplot(cbind(z1, z2), facet = NULL)
Note: The question omitted input2 so there could be some additional considerations from aspects not shown.

Resources