Related
My data has two variables, TRV and TRJ, and I am seeing how often they match with each other. I would like to place on a circos figure what percent each variable shows up in total on the outer layer. It should add up to 200%, 100% for TRV, 100% for TRJ.
library(circlize)
library(plyr)
df <- structure(list(TRV = structure(c(1L, 1L, 1L, 2L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 5L, 5L, 5L), .Label = c("TRAV29/DV5", "TRAV36/DV7",
"TRDV1", "TRDV2", "TRDV3", "TRGV8", "TRGV9"), class = "factor"),
TRJ = structure(c(64L, 65L, 67L, 64L, 64L, 65L, 66L, 67L,
64L, 65L, 66L, 64L, 65L, 66L), .Label = c("", "mTRAJ22",
"mTRAJ30", "mTRAJ34", "mTRAJ37", "mTRAJ45", "mTRAJ49", "mTRBJ1-1",
"mTRBJ2-5", "mTRDJ1", "mTRDJ2", "mTRGJ1", "mTRGJ4", "TRAJ10",
"TRAJ15", "TRAJ16", "TRAJ19", "TRAJ2", "TRAJ20", "TRAJ21",
"TRAJ22", "TRAJ23", "TRAJ24", "TRAJ26", "TRAJ27", "TRAJ30",
"TRAJ32", "TRAJ34", "TRAJ36", "TRAJ37", "TRAJ38", "TRAJ39",
"TRAJ40", "TRAJ41", "TRAJ42", "TRAJ43", "TRAJ44", "TRAJ45",
"TRAJ49", "TRAJ5", "TRAJ50", "TRAJ52", "TRAJ53", "TRAJ54",
"TRAJ56", "TRAJ57", "TRAJ58", "TRAJ6", "TRAJ7", "TRAJ8",
"TRBJ1-1", "TRBJ1-2", "TRBJ1-3", "TRBJ1-4", "TRBJ1-5", "TRBJ1-6",
"TRBJ2-1", "TRBJ2-2", "TRBJ2-3", "TRBJ2-4", "TRBJ2-5", "TRBJ2-6",
"TRBJ2-7", "TRDJ1", "TRDJ2", "TRDJ3", "TRDJ4", "TRGJ1", "TRGJ2",
"TRGJP", "TRGJP1", "TRGJP2"), class = "factor"), freq = c(387L,
3L, 1L, 1L, 3533L, 445L, 132L, 55L, 563L, 15L, 5L, 830L,
4L, 72L)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
10L, 11L, 12L, 13L, 14L, 15L), class = "data.frame")
grid.col = c(`TRAV29/DV5` = "red", TRDV3 = "green", TRDV2 = "blue", TRDV1 = "purple", `TRAV36/DV7` = "pink",
TRDJ4 = "orange", TRDJ1 = "palegreen", TRDJ2 = "lightsteelblue", TRDJ3 = "thistle", TRGJP = "yellow", TRGJ2 = "grey", TRGJP2 = "brown", TRGJ1 = "lightpink")
circ_plot <- function(df){
circos.par(canvas.xlim=c(-1.5,1.5),canvas.ylim=c(-1.5,1.5))
chordDiagram(df, annotationTrack = "grid",
grid.col = grid.col,
link.lwd = matrix(1, nrow = nrow(df), ncol = ncol(df)),
link.border = "black")
for(si in get.all.sector.index()) {
xlim = get.cell.meta.data("xlim", sector.index = si, track.index = 1)
ylim = get.cell.meta.data("ylim", sector.index = si, track.index = 1)
circos.text(mean(xlim),ylim[1], si, sector.index = si, track.index = 1,
facing = "clockwise",
cex=0.8,
adj=c(-1,0),
niceFacing = TRUE)
circos.axis(h = 0,
major.at = c(0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5) ,
labels.cex = 0.2,labels.facing = "inside",
sector.index = si, track.index = 1)
}
# Restart circular layout parameters
circos.clear()
}
circ_plot(df)
I am trying to use geom_label_repel to add labels to a couple of data points on a plot. In this case, they happen to be outliers on box plots. I've got most of the code working, I can label the outlier, but for some reason I am getting multiple labels (equal to my sample size for the entire data set) mapped to that point. I'd like just one label for this outlier.
Example:
Here is my data:
dput(sus_dev_data)
structure(list(time_point = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), .Label = c("3", "8", "12"), class = "factor"),
days_to_pupation = c(135L, 142L, 143L, 155L, 149L, 159L,
153L, 171L, 9L, 67L, 53L, 49L, 72L, 67L, 55L, 64L, 60L, 122L,
53L, 51L, 49L, 53L, 50L, 56L, 44L, 47L, 60L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 20L, 21L, 22L, 23L, 24L, 26L, 27L, 28L, 29L, 30L), class = "data.frame")
and my code...
####################################################################################################
# Time to pupation statistical analysis
####################################################################################################
## linear model
pupation_Model=lm(sus_dev_data$days_to_pupation~sus_dev_data$time_point)
pupationANOVA=aov(pupation_Model)
summary(pupationANOVA)
# Tukey test to study each pair of treatment :
pupationTUKEY <- TukeyHSD(x=pupationANOVA, which = 'sus_dev_data$time_point',
conf.level=0.95)
## Function to generate significance labels on box plot
generate_label_df <- function(pupationTUKEY, variable){
# Extract labels and factor levels from Tukey post-hoc
Tukey.levels <- pupationTUKEY[[variable]][,4]
Tukey.labels <- data.frame(multcompLetters(Tukey.levels, reversed = TRUE)['Letters'])
#I need to put the labels in the same order as in the boxplot :
Tukey.labels$treatment=rownames(Tukey.labels)
Tukey.labels=Tukey.labels[order(Tukey.labels$treatment) , ]
return(Tukey.labels)
}
#generate labels using function
labels<-generate_label_df(pupationTUKEY , "sus_dev_data$time_point")
#rename columns for merging
names(labels)<-c('Letters','time_point')
# obtain letter position for y axis using means
pupationyvalue<-aggregate(.~time_point, data=sus_dev_data, max)
#merge dataframes
pupationfinal<-merge(labels,pupationyvalue)
####################################################################################################
# Time to pupation plot
####################################################################################################
# Plot of data
(pupation_plot <- ggplot(sus_dev_data, aes(time_point, days_to_pupation)) +
Alex_Theme +
geom_boxplot(fill = "grey80", outlier.size = 0.75) +
geom_text(data = pupationfinal, aes(x = time_point, y = days_to_pupation,
label = Letters),vjust=-2,hjust=.5, size = 4) +
#ggtitle(expression(atop("Days to pupation"))) +
labs(y = 'Days to pupation', x = 'Weeks post-hatch') +
scale_y_continuous(limits = c(0, 200)) +
scale_x_discrete(labels=c("3" = "13", "8" = "18",
"12" = "22")) +
geom_label_repel(aes(x = 1, y = 9),
label = '1')
)
Here's a shorter example to demonstrate what is going on. Essentially, your labels are beng recycled to be the same length as the data.
df = data.frame(x=1:5, y=1:5)
ggplot(df, aes(x,y, color=x)) +
geom_point() +
geom_label_repel(aes(x = 1, y = 1), label = '1')
You can override this by providing new data for the ggrepel
ggplot(df, aes(x,y, color=x)) +
geom_point() +
geom_label_repel(data = data.frame(x=1, y=1), label = '1')
Based on your data, you have 3 outliers (one in each group), you can manually identify them by applying the classic definition of outliers by John Tukey (Upper: Q3+1.5*IQR and Lower: Q1-1.5*IQR) (but you are free to set your own rules to define an outlier). You can use the function quantile and IQR to get those points.
Here, I incorporated them in a sequence of pipe using dplyr package:
library(tidyverse)
Outliers <- sus_dev_data %>% group_by(time_point) %>%
mutate(Out_up = ifelse(days_to_pupation > quantile(days_to_pupation,0.75)+1.5*IQR(days_to_pupation), "Out","In"))%>%
mutate(Out_Down = ifelse(days_to_pupation < quantile(days_to_pupation,0.25)-1.5*IQR(days_to_pupation), "Out","In")) %>%
filter(Out_up == "Out" | Out_Down == "Out")
# A tibble: 3 x 4
# Groups: time_point [3]
time_point days_to_pupation Out_up Out_Down
<fct> <int> <chr> <chr>
1 3 9 In Out
2 8 122 Out In
3 12 60 Out In
As mentioned by #dww, you need to pass a new dataframe to geom_label_repel if you want your outliers to be single labeled. So, here we use the dataframe Outliers to feed the geom_label_repel function:
library(ggplot2)
library(ggrepel)
ggplot(sus_dev_data, aes(time_point, days_to_pupation)) +
#Alex_Theme +
geom_boxplot(fill = "grey80", outlier.size = 0.75) +
geom_text(data = pupationfinal, aes(x = time_point, y = days_to_pupation,
label = Letters),vjust=-2,hjust=.5, size = 4) +
#ggtitle(expression(atop("Days to pupation"))) +
labs(y = 'Days to pupation', x = 'Weeks post-hatch') +
scale_y_continuous(limits = c(0, 200)) +
scale_x_discrete(labels=c("3" = "13", "8" = "18",
"12" = "22")) +
geom_label_repel(inherit.aes = FALSE,
data = Outliers,
aes(x = time_point, y = days_to_pupation, label = "Out"))
And you get the following graph:
I hope it helps you to figure it how to label all your outliers.
I'm pretty new to R so I don't really know what I'm doing. Anyway, I have data in this format in excel (as a csv file):
dt <- data.frame(species = rep(c("a", "b", "c"), each = 4),
cover = rep(1:3, times = 4),
depth = rep(c(15, 30, 60, 90), times = 3),
stringsAsFactors = FALSE)
I want to plot a graph of cover against depth, with a different coloured line for each species, and a key for which species is which colour. I don't even know where to start.
Sorry if something similar has been asked before. Any help would be much appreciated!
Don't know if this is in a helpful format but here's some of the actual data, I need to read more about dput I think:
structure(list(species = structure(c(1L, 1L, 2L, 2L, 3L, 3L,
4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L,
11L), .Label = c("Agaricia fragilis", "bryozoan", "Dichocoenia stokesi",
"Diploria labyrinthiformis", "Diploria strigosa", "Madracis decactis",
"Manicina", "Montastrea cavernosa", "Orbicella franksi", "Porites asteroides",
"Siderastrea radians"), class = "factor"), cover = c(0.021212121,
0.04047619, 0, 0, 0, 0, 1.266666667, 4.269047619, 3.587878788,
3.25, 0.118181818, 0.152380952, 0, 0.007142857, 3.806060606,
2.983333333, 14.13030303, 15.76190476, 0.415151515, 0.2, 0.26969697,
0.135714286), depth = c(30L, 15L, 30L, 15L, 30L, 15L, 30L, 15L,
30L, 15L, 30L, 15L, 30L, 15L, 30L, 15L, 30L, 15L, 30L, 15L, 30L,
15L)), .Names = c("species", "cover", "depth"), row.names = c(NA,
22L), class = "data.frame")
Here is a solution using the ggplot2 package.
# Load packages
library(ggplot2)
# Create example data frame based on the original example the OP provided
dt <- data.frame(species = rep(c("a", "b", "c"), each = 4),
cover = rep(1:3, times = 4),
depth = rep(c(15, 30, 60, 90), times = 3),
stringsAsFactors = FALSE)
# Plot the data
ggplot(dt, aes(x = depth, y = cover, group = species, colour = species)) +
geom_line()
This should get you going!
df1 <- read.csv("//file_location.csv", headers=T)
library(dplyr)
df1 <- df1 %>% select(species, depth) %>% group_by(species) %>%
summarise(mean(depth)
library(ggplot2)
ggplot(df1, aes(x=depth, y=species, group=species, color=species) +
geom_line()
I have a bubble chart using plotly in R but the order of the axis appear to be somehow odd.
The output is as follows and you can see how the axis are not correct:
The code that I'm using is as follows
library(plotly)
library(ggplot2)
file <- c("C://link//data.csv")
#dataSource <- read.csv(file, sep =",", header = TRUE)
dataSource <- read.table(file, header=T, sep=",")
dataSource <- na.omit(dataSource)
slope <- 1
dataSource$size <- sqrt(dataSource$Y.1 * slope)
colors <- c('#4AC6B7', '#1972A4') #, '#965F8A', '#FF7070', '#C61951')
plot_ly(dataSource,
x = ~Y.1.vs.Y.2,
y = ~YTD.vs.Y.1.YTD,
color = ~BU,
size = ~size,
colors = colors,
type = 'scatter',
mode = 'markers',
sizes = c(min(dataSource$size), max(dataSource$size)),
marker = list(symbol = 'circle', sizemode = 'diameter',
line = list(width = 2, color = '#FFFFFF')),
text = ~paste('Business Unit:',
BU, '<br>Product:',
Product, '<br>Y.1.vs.Y.2:',
Y.1.vs.Y.2, '<br>YTD.vs.Y.1.YTD:',
YTD.vs.Y.1.YTD)) %>%
layout(title = 'Y.1.vs.Y.2 v. YTD.vs.Y.1.YTD',
xaxis = list(title = 'Y.1.vs.Y.2',
gridcolor = 'rgb(255, 255, 255)',
zerolinewidth = 1,
ticklen = 5,
gridwidth = 2),
yaxis = list(title = 'YTD.vs.Y.1.YTD',
gridcolor = 'rgb(255, 255, 255)',
zerolinewidth = 1,
ticklen = 5,
gridwith = 2),
paper_bgcolor = 'rgb(243, 243, 243)',
plot_bgcolor = 'rgb(243, 243, 243)')
The data is as follows:
structure(list(BU = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("B", "D"), class = "factor"), Product = structure(c(4L, 5L, 7L, 8L, 9L, 13L, 1L, 3L, 4L, 11L, 12L, 13L), .Label = c("ADT", "BHL", "CEX", "CMX", "CTL", "HTH", "MTL", "SSL", "TLS", "UTV", "WEX", "WLD", "WMX"), class = "factor"), Y.2 = c(4065L, 499L, 20L, 5491L, 781L, 53L, 34L, 1338L, 557L, 428L, 310L, 31L), Y.1 = c(4403L, 550L, 28L, 5225L, 871L, 46L, 22L, 1289L, 602L, 426L, 318L, 37L), Y.1.YTD = c(4403L, 550L, 28L, 5225L, 871L, 46L, 22L, 1289L, 602L, 426L, 318L, 37L), YTD = c(5026L, 503L, 29L, 3975L, 876L, 40L, 62L, 1395L, 717L, 423L, 277L, 35L), Y.1.vs.Y.2 = structure(c(12L, 7L, 11L, 4L, 8L, 1L, 2L, 3L, 12L, 6L, 10L, 9L), .Label = c("-13%", "-35%", "-4%", "-5%", "-76%", "0%", "10%", "12%", "19%", "3%", "40%", "8%"), class = "factor"), YTD.vs.Y.1.YTD = structure(c(8L, 5L, 11L, 3L, 7L, 2L, 9L, 12L, 10L, 1L, 2L, 4L), .Label = c("-1%", "-13%", "-24%", "-5%", "-9%", "0%", "1%", "14%", "182%", "19%", "4%", "8%"), class = "factor")), .Names = c("BU", "Product", "Y.2", "Y.1", "Y.1.YTD", "YTD", "Y.1.vs.Y.2", "YTD.vs.Y.1.YTD"), row.names = c(2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 13L, 14L, 15L), class = "data.frame", na.action = structure(c(1L, 7L, 12L), .Names = c("1", "7", "12"), class = "omit"))
Any ideas on how can I order the axis properly?
Thanks
There are a few ways to manipulate factor levels, but things can get a bit messy if you're not careful. You should familiarize yourself with ?levels and ?factor, as well as maybe ?reorder, ?relevel
In the meantime, try something like this
dataSource[[7]] <- factor(dataSource[[7]], levels = c("-76%", "-35%", "-13%", "-5%", "-4%", "0%", "3%", "8%", "10%", "12%", "19%", "40%"))
Edit
To consolidate my answer and comment...
This behaviour is caused because of the way factors are encoded. Your axes are strings and factor order is determined alphnumerically. So to change their order you have to specify it as above, or else code them numerically and give them the required names. There are many different ways to change them, in several packages. This answer provides a standard base R method for handling factors. For further info start with the manual pages I suggested.
As for it being "very manual", since factors are categorical (and therefore have a potentially arbitrary order), there is no way to automate their order unless you code them numerically in the desired order.
Thanks to the comments above I've been able to resolve the issue. Find below the full code, which I hope might help other users:
library(plotly)
library(ggplot2)
file <- c("C://link//data.csv")
dataSource <- read.table(file, header=T, sep=",")
dataSource <- na.omit(dataSource)
# Additional code to format the input values and recalculate the percentages
BUValues = dataSource$BU
ProductValues = dataSource$Product
dataSource <- as.data.frame(data.matrix(dataSource), stringsAsfactors = FALSE)
dataSource$BU = BUValues
dataSource$Product = ProductValues
dataSource$Y.1.vs.Y.2 = round((dataSource$Y.1/dataSource$Y.2 -1)*100,2)
dataSource$YTD.vs.Y.1.YTD = round((dataSource$YTD/dataSource$Y.1.YTD -1)*100,2)
slope <- 1
dataSource$size <- sqrt(dataSource$Y.1 * slope)
colors <- c('#4AC6B7', '#1972A4') #, '#965F8A', '#FF7070', '#C61951')
plot_ly(dataSource,
x = ~Y.1.vs.Y.2,
y = ~YTD.vs.Y.1.YTD,
color = ~BU,
size = ~size,
colors = colors,
type = 'scatter',
mode = 'markers',
sizes = c(min(dataSource$size), max(dataSource$size)),
marker = list(symbol = 'circle', sizemode = 'diameter',
line = list(width = 2, color = '#FFFFFF')),
text = ~paste('Business Unit:', BU,
'<br>Product:', Product,
'<br>YoY:',Y.1.vs.Y.2,
'<br>YTD:',YTD.vs.Y.1.YTD)) %>%
layout(title = 'YoY vs YTD Performance',
xaxis = list(title = 'YoY Performance (%)',
gridcolor = 'rgb(255, 255, 255)',
zerolinewidth = 1,
ticklen = 5,
gridwidth = 2),
yaxis = list(title = 'YTD Performance (%)',
gridcolor = 'rgb(255, 255, 255)',
zerolinewidth = 1,
ticklen = 5,
gridwith = 2),
paper_bgcolor = 'rgb(243, 243, 243)',
plot_bgcolor = 'rgb(243, 243, 243)')
I have several dataframes that I am applying a function to
The function works but I would like to lapply it to several dataframes and output the result according to the input names.
Here is an example of one of the dataframes
structure(list(chr = structure(c(1L, 1L, 1L), .Label = c("chr1",
"chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16",
"chr17", "chr18", "chr19", "chr2", "chr20", "chr21", "chr22",
"chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chrX",
"chrY"), class = "factor"), leftPos = c(100260254L, 100735342L,
100805662L), strand.x = structure(c(1L, 1L, 2L), .Label = c("-",
"+"), class = "factor"), X50CellJ_SLX.9395.FSeqJ.fq.gz = c(7L,
295L, 132L), Cytospongex10_SLX.9395.FSeqK.fq.gz = c(72L, 256L,
148L), FFPE20X_SLX.9395.fq.gz = c(5L, 74L, 36L), Tumour10_SMACCO_AH_088_SLX.9396.FSeqH.fq.gz = c(13L,
154L, 65L), Tumour11_SMACCO_SH_020_SLX.9396.FSeqI.fq.gz = c(1L,
0L, 0L), Tumour12_SMACCO_ED_008_SLX.9396.FSeqJ.fq.gz = c(3L,
25L, 8L), Tumour13_SMACCO_AH_086_SLX.9396.FSeqK.fq.gz = c(7L,
120L, 28L), Tumour1_SMACCO_AH_100_SLX.9396.FSeqA.fq.gz = c(0L,
0L, 0L), Tumour2_SMACCO_AH_058_SLX.9396.FSeqB.fq.gz = c(24L,
98L, 42L), Tumour3_SMACCO_SH_051_SLX.9396.FSeqC.fq.gz = c(29L,
92L, 29L), Tumour4_SMACCO_ED_031_SLX.9396.FSeqD.fq.gz = c(18L,
53L, 14L), Tumour5_SMACCO_RS_027_SLX.9396.FSeqE.fq.gz = c(8L,
93L, 17L), Tumour7_SMACCO_AH_026_SLX.9396.FSeqF.fq.gz = c(30L,
205L, 60L), Tumour9_SMACCO_ST_024_SLX.9396.FSeqG.fq.gz = c(15L,
129L, 17L), strand.y = structure(c(1L, 1L, 2L), .Label = c("-",
"+"), class = "factor"), Tumour14_SMACCO_AH_094_SLX.9394.FSeqA.fq.gz = c(0L,
7L, 3L), Tumour15_SMACCO_WG_006_SLX.9394.FSeqB..fq.gz = c(3L,
19L, 4L), Tumour16_SMACCO_ST_035_SLX.9394.FSeqC.fq.gz = c(1L,
23L, 8L), Tumour17_SMACCO_ST_034_SLX.9394.fq.gz = c(7L, 26L,
5L), Control19_SLX.9394.FSeqE.fq.gz = c(51L, 256L, 36L), Control20_SLX.9394.FSeqF.fq.gz = c(23L,
110L, 34L), Control21_SLX.9394.FSeqG..fq.gz = c(30L, 56L,
11L), Control22_SLX.9394.FSeqH.fq.gz = c(22L, 72L, 24L), Control23_SLX.9394.FSeqI.fq.gz = c(10L,
23L, 2L), Control25_SLX.9394.FSeqJ.fq.gz = c(17L, 72L, 8L),
Control27_SLX.9394.FSeqK.fq.gz = c(10L, 21L, 9L), Control28_SLX.9395.FSeqA.fq.gz = c(13L,
40L, 4L), Control29_SLX.9395.FSeqB.fq.gz = c(14L, 39L,
6L), Control30_SLX.9395.FSeqC.fq.gz = c(5L, 32L, 5L),
Control31_SLX.9395.FSeqD.fq.gz = c(7L, 11L, 5L), Control32_SLX.9395.FSeqE.fq.gz = c(5L,
32L, 4L), Control33_SLX.9395.FSeqF.fq.gz = c(10L, 25L,
6L), Control34_SLX.9395.FSeqG.fq.gz = c(3L, 32L, 1L),
Control35_SLX.9395.FSeqH.fq.gz = c(10L, 33L, 0L), Controls = c(0L,
0L, 0L), Samples = c(0L, 0L, 0L)), .Names = c("chr", "leftPos",
"strand.x", "X50CellJ_SLX.9395.FSeqJ.fq.gz", "Cytospongex10_SLX.9395.FSeqK.fq.gz",
"FFPE20X_SLX.9395.fq.gz", "Tumour10_SMACCO_AH_088_SLX.9396.FSeqH.fq.gz",
"Tumour11_SMACCO_SH_020_SLX.9396.FSeqI.fq.gz", "Tumour12_SMACCO_ED_008_SLX.9396.FSeqJ.fq.gz",
"Tumour13_SMACCO_AH_086_SLX.9396.FSeqK.fq.gz", "Tumour1_SMACCO_AH_100_SLX.9396.FSeqA.fq.gz",
"Tumour2_SMACCO_AH_058_SLX.9396.FSeqB.fq.gz", "Tumour3_SMACCO_SH_051_SLX.9396.FSeqC.fq.gz",
"Tumour4_SMACCO_ED_031_SLX.9396.FSeqD.fq.gz", "Tumour5_SMACCO_RS_027_SLX.9396.FSeqE.fq.gz",
"Tumour7_SMACCO_AH_026_SLX.9396.FSeqF.fq.gz", "Tumour9_SMACCO_ST_024_SLX.9396.FSeqG.fq.gz",
"strand.y", "Tumour14_SMACCO_AH_094_SLX.9394.FSeqA.fq.gz",
"Tumour15_SMACCO_WG_006_SLX.9394.FSeqB..fq.gz", "Tumour16_SMACCO_ST_035_SLX.9394.FSeqC.fq.gz",
"Tumour17_SMACCO_ST_034_SLX.9394.fq.gz", "Control19_SLX.9394.FSeqE.fq.gz",
"Control20_SLX.9394.FSeqF.fq.gz", "Control21_SLX.9394.FSeqG..fq.gz",
"Control22_SLX.9394.FSeqH.fq.gz", "Control23_SLX.9394.FSeqI.fq.gz",
"Control25_SLX.9394.FSeqJ.fq.gz", "Control27_SLX.9394.FSeqK.fq.gz",
"Control28_SLX.9395.FSeqA.fq.gz", "Control29_SLX.9395.FSeqB.fq.gz",
"Control30_SLX.9395.FSeqC.fq.gz", "Control31_SLX.9395.FSeqD.fq.gz",
"Control32_SLX.9395.FSeqE.fq.gz", "Control33_SLX.9395.FSeqF.fq.gz",
"Control34_SLX.9395.FSeqG.fq.gz", "Control35_SLX.9395.FSeqH.fq.gz",
"Controls", "Samples"), row.names = c(NA, 3L), class = "data.frame")
Here is what I have so far
mylist <- list(A = OriginalMeta , B = SLX9392 , C = SLX9393, D = SLX9397, E = Gastric, F = Dysplasia, G = GoodDysplasia, H = Cholangio, I = LCM_PS14_1105_1F)
sortIt <- function(df1) {
df1$strand.x<- NULL
df1$strand.y<- NULL
df1$strand<-NULL
df1$X.<-NULL
names(df1)[1] <- c("chr")
#Get rid of X and Y chromosomes
df1 <- df1[!grepl("chrX", df1$chr), ]
df1 <- df1[!grepl("chrY", df1$chr), ]
xyAss3<-df1
return(xyAss3)
}
lapply(names(mylist),
sortIt(x)write.csv(mylist[x],
file =paste0(x,'.csv')))
The thing is I just dont know how to feed the mylist into the function. Should I call x in the lapply df1? I'm a bit confused as to how to tie it all together.
I think you'll do better to fold the creation of the .csv into your function and then use a for loop to apply that function to each object in your list in turn. So something like this, where df is the sample data frame you posted:
mylist <- list(A = df, B = df)
sortIt <- function(i) {
df = mylist[[i]]
df[,"strand.x"] <- NULL
df[,"strand.y"] <- NULL
df[,"strand"] <- NULL
df[,"X."] <- NULL
names(df) <- c("chr", names(df)[2:length(names(df))])
df <- df[!grepl("chrX", df$chr), ]
df <- df[!grepl("chrY", df$chr), ]
write.csv(df, file = paste0(names(mylist)[i], ".csv"), row.names=FALSE)
}
for (i in seq(length(mylist))) {sortIt(i)}
If you were trying to create a new object in your workspace, then one of the apply functions would be a better bet. But when you're trying to output files, I think you need to use a for loop instead.
Not really sure what you are trying to achieve, but guessing that you want to save the transformed data frame to a file with a name taken from the list, this could do the job (it should work with the rest of your code - note the [[1]]):
lapply(names(mylist),
function(x) write.csv(sortIt(mylist[x][[1]]),
file = paste0(x,'.csv')))
Another option is to use mapply, here I'm attaching a complete example:
# create the data
dframes <- lapply(1:3, function(x) data.frame(x=rnorm(10), y=runif(10)))
names(dframes) <- LETTERS[1:3]
# the transformation function
sortdf <- function(df) df[order(df$x),]
# two variants of apply
lapply(names(dframes),
function(name) write.csv(sortdf(dframes[name][[1]]),
file=paste0(name, '.csv')))
# mapply does not have the ugly [[1]] syntax bit, I'd prefer it myself
mapply(function(name, df) write.csv(sortdf(df), file=paste0(name, '.csv')),
names(dframes),
dframes)