Isolating elements from a specific column using R - r
I've been experimenting with the TikTok Scraper tool for a little while and, while I'm not an expert, I've been using R for some of the simpler analyses of the scraped data, but I've been experiencing issues with the hashtag scraping.
Whenever I scrape data from, let's say, a specific hashtag, I receive data in a format that's not very suitable for my work, for instance:
All that interests me are the terms that come after "name": — that is, 'globolixoo' and 'mentira'.
Is there any way in which I can isolate these terms and separate them by comma (globolixoo,mentira)?
Reproducible code example:
# Print a reproducible representation of the first 10 rows of the scraped data.
dput(head(globolixoo, 10))
structure(list(id = c(6808536063938710528, 6814233256737786880,
6825734509393103872, 6945455970969488384, 6949635086916635648,
6970340938765978624, 6971908200639630336, 6973074032547613696,
6973112184809212928, 6973333129226505216), secretID = c(6808536063938710528,
6814233256737786880, 6825734509393103872, 6945455970969488384,
6949635086916635648, 6970340938765978624, 6971908200639630336,
6973074032547613696, 6973112184809212928, 6973333129226505216
), webVideoUrl = c("",
), text = c("#GloboLixoo", "#globolixoo", "#GLOBOLIXOO 🤮🤮",
"#GloboLixoo", "#globolixoo", "#globolixoo #mentira kkkkkkk",
"E assim foi a minha chegada aqui... valeu galera ❤️ #lulaxbolsonaro #globolixoo x #bolsonaro2022 #foryou #fory #fyyy #golpista #obrigadogalera",
"Responder a #luizguilhermeol2 #flamengooooo #paravocefo #dublagem19 #flamengosergipe #copaamerica #flamengo #flamengolibertadores #Eriksen #globolixoo",
"Responder a #vinicius.almeida829 #globolixoo #flamengolibertadores #flamengo #copaamerica #flamengosergipe #dublagem19 #neymar #globol",
"#neymar #Eriksen #globolixoo #flamengolibertadores #dublagem19 #flamengosergipe #copaamerica #foryoupage #paravoce #globolixooo #flamengo #fyyy"
), createTime = c(1585235836L, 1586562318L, 1589240162L, 1617114988L,
1618088013L, 1622908971L, 1623273877L, 1623545319L, 1623554204L,
1623605644L), date = c("26/03/2020", "10/04/2020", "11/05/2020",
"30/03/2021", "10/04/2021", "05/06/2021", "09/06/2021", "13/06/2021",
"13/06/2021", "13/06/2021"), = c(6807209790076404736,
6808216797218505728, 6822488437678539776, 6573364480217792512,
6949596571051525120, 6801837537368605696, 6817099676148990976,
6890609472302023680, 6890609472302023680, 6890609472302023680
), authorMeta.secUid = c("MS4wLjABAAAAUCftiaqGW7kGk4tZlB2socpml7caR7G1BWDVPCMoZeYPYxujhgejBN79QwvG47Ux",
), = c("aguiarsillvabarbe", "deysetavares77",
"davinunesrocha1988", "fernandomedeiros84", "marcoaaspm", "diverticity",
"erick.castrooficial", "dublagem19", "dublagem19", "dublagem19"
), authorMeta.nickName = c("Aguiar Sillva Barbei", "Kayque Jesus",
"Davi Nunes Rocha", "Fernando Medeiros", "user9898887549048",
"diverticity", "Erick Rosário", "Ramon Santos", "Ramon Santos",
"Ramon Santos"), authorMeta.verified = c(FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), authorMeta.signature = c("",
"me seguem no tik Tok gentee meu Instagram é #deyse3760", "",
"", "Sou um cara legal que adora tomar umas cervejas e fazer um bom churrasco...",
"", "Jornalista | Pregador do Evangelho\n\nEX GLOBAL\nInsta:#erick_castrooficial",
"muito obrigado pelo 90k 🙏🙏🙏🔥", "muito obrigado pelo 90k 🙏🙏🙏🔥",
"muito obrigado pelo 90k 🙏🙏🙏🔥"), authorMeta.avatar = c("",
), authorMeta.following = c(36L, 91L, 86L, 2091L, 17L, 103L,
19L, 0L, 0L, 0L), = c(23L, 54L, 61L, 1741L, 5L,
9L, 14900L, 103800L, 103800L, 103800L), authorMeta.heart = c(87L,
536L, 5L, 9168L, 2L, 67L, 149400L, 1800000L, 1800000L, 1800000L
), = c(13L, 90L, 1L, 78L, 1L, 7L, 45L, 1388L,
1388L, 1388L), authorMeta.digg = c(27L, 101L, 772L, 6976L, 429L,
203L, 202L, 1964L, 1964L, 1964L), musicMeta.musicId = c(6808524996944628736,
6814115937994755072, 6817770512362654720, 6945455736818371584,
6949634983900351488, 6970340831324605440, 6971908113897098240,
6973073959780731904, 6.973112023635e+18, 6973333140853115904),
musicMeta.musicName = c("som original", "som original", "som original",
"som original", "som original", "som original", "som original",
"som original", "som original", "som original"), musicMeta.musicAuthor = c("Aguiar Sillva Barbei",
"Tik Toker", "Lennon Rikelme", "Fernando Medeiros", "user9898887549048",
"diverticity", "Erick Rosário", "Ramon Santos", "Ramon Santos",
"Ramon Santos"), musicMeta.musicOriginal = c(TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), musicMeta.musicAlbum = c("",
"", "", "", "", "", "", "", "", ""), musicMeta.playUrl = c("",
), musicMeta.coverThumb = c("",
), musicMeta.coverMedium = c("",
), musicMeta.coverLarge = c("",
), musicMeta.duration = c(60L, 10L, 14L, 24L, 18L, 59L, 14L,
180L, 180L, 180L), covers.default = c("",
), covers.origin = c("",
), covers.dynamic = c("",
), videoUrl = c("",
), videoUrlNoWaterMark = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), videoApiUrlNoWaterMark = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), videoMeta.height = c(480L, 1024L, 1024L,
480L, 576L, 1024L, 1024L, 1024L, 1024L, 1024L), videoMeta.width = c(848L,
576L, 576L, 848L, 1024L, 576L, 576L, 576L, 576L, 576L), videoMeta.duration = c(60L,
4L, 14L, 24L, 18L, 59L, 14L, 180L, 180L, 180L), diggCount = c(2L,
6L, 5L, 17L, 2L, 3L, 33L, 30200L, 23900L, 663L), shareCount = c(0L,
0L, 0L, 9L, 0L, 0L, 0L, 1540L, 468L, 37L), playCount = c(30L,
57L, 74L, 385L, 23L, 24L, 2783L, 930500L, 430600L, 34600L
), commentCount = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 224L, 205L,
16L), downloaded = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE), mentions = c("[]", "[]", "[]",
"[]", "[]", "[]", "[]", "[\"#luizguilhermeol2\"]", "[\"#vinicius\"]",
"[]"), hashtags = c("[{\"id\":\"1662240138893317\",\"name\":\"globolixoo\",\"title\":\"\",\"cover\":\"\"}]",
), effectStickers = c("[]", "[{\"id\":\"333023\",\"name\":\"Humor de feriado\"}]",
"[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]")), row.names = c(NA,
10L), class = "data.frame")
Update II (previous solutions removed):
# Extract the hashtag names from the JSON-like `hashtags` column and collapse
# them into one comma-separated string per video (row).
# Requires dplyr, tidyr and stringr (e.g. via library(tidyverse)).
df %>%
  # Remember the original row so the pieces can be regrouped afterwards.
  mutate(id = row_number()) %>%
  # Split right before each name value ...
  separate_rows(hashtags, sep = '\",\"name\":\"') %>%
  # ... and right after it, so every name ends up in a row of its own.
  separate_rows(hashtags, sep = '\",\"title\"') %>%
  # Keep only the pieces that start with a word character, i.e. the names.
  # The leftover JSON fragments start with `[` or `:`; note that rows whose
  # hashtags value is just `[]` are dropped by this filter as well.
  filter(str_detect(hashtags, '^[\\w]')) %>%
  group_by(id) %>%
  # One row per video again, names joined with ", ".
  summarise(hashtags = toString(hashtags), .groups = "drop")
1 globolixoo
2 globolixoo
3 globolixoo
4 globolixoo
5 globolixoo
6 globolixoo, mentira
7 lulaxbolsonaro, globolixoo, bolsonaro2022, foryou, fory, fyyy, golpista, obrigadogalera
8 flamengooooo, paravocefo, dublagem19, flamengosergipe, copaamerica, flamengo, flamengolibertado…
9 globolixoo, flamengolibertadores, flamengo, copaamerica, flamengosergipe, dublagem19, neymar, g…
10 neymar, eriksen, globolixoo, flamengolibertadores, dublagem19, flamengosergipe, copaamerica, fo…
how to add conditional statement when doing double legend
I am trying to make a plot with a legend for both the horizontal lines and the vertical lines. Now I would like to test whether I can add an if statement based on the event status. For the df with both Delay and Sick, my code works. But if I want to modify my plotting part so I can use it on a df that might only have Delay or Sick, what should I do with my geom_vline and scale_linetype_manual parts? For example, if I want to use my code on df2. df<-structure(list(Day = c(0L, 0L, 0L, 1L, 1L, 1L, 8L, 8L, 8L, 15L, 15L, 15L, 22L, 22L, 22L, 27L, 29L, 29L, 29L, 36L, 36L, 36L, 43L, 43L, 43L, 43L, 43L, 43L), Subject = c("ELA", "Math", "Art", "Math", "Art", "ELA", "ELA", "Math", "Art", "ELA", "Math", "Art", "ELA", "Math", "Art", NA, "ELA", "Math", "Art", "ELA", "Math", "Art", "Art", "Art", "Math", "Math", "ELA", "ELA"), Score = c(73L, 157L, 75L, 111L, 82L, 69L, 78L, 131L, 93L, 58L, 109L, 99L, 79L, 131L, 84L, NA, 67L, 106L, 90L, 75L, 123L, 95L, 122L, 122L, 137L, 137L, 83L, 83L), Event = c(NA, NA, NA, "Delay", "Delay", "Delay", NA, NA, NA, NA, NA, NA, NA, NA, NA, "Sick", NA, NA, NA, NA, NA, NA, "Sick", "Delay", "Sick", "Delay", "Sick", "Delay")), class = "data.frame", row.names = c(NA, -28L)) ggplot(data =df)+ geom_line(data=df[!is.na(df$Score),],aes(x = Day, y = Score, color=Subject),size=0.8)+ scale_colour_manual(breaks = c("ELA", "Math", "Art"), values=c(ELA="#cc0022",Math="#70ad47", Art="#fd9300"))+ geom_vline(data=df[(!is.na(df$Event)&df$Event=="Delay"),], aes(xintercept=jitter(Day), linetype="Delay"), color="black", size=0.4)+ geom_vline(data=df[(!is.na(df$Event)&df$Event=="Sick"),], aes(xintercept=jitter(Day), linetype="Sick"), color="purple", size=0.4)+ scale_linetype_manual(name = 'Event', values = c('Delay' = 1, 'Sick' = 1), guide = guide_legend(override.aes = list(colour = c("black", "purple")))) df2 <-structure(list(Day = c(0L, 0L, 0L, 1L, 1L, 1L, 8L, 8L, 8L, 15L, 15L, 15L, 22L, 22L, 22L, 27L, 29L, 29L, 29L, 36L, 36L, 36L, 43L, 43L, 43L, 43L, 43L, 43L), Subject = c("ELA", "Math", "Art", "Math", "Art", 
"ELA", "ELA", "Math", "Art", "ELA", "Math", "Art", "ELA", "Math", "Art", NA, "ELA", "Math", "Art", "ELA", "Math", "Art", "Art", "Art", "Math", "Math", "ELA", "ELA"), Score = c(73L, 157L, 75L, 111L, 82L, 69L, 78L, 131L, 93L, 58L, 109L, 99L, 79L, 131L, 84L, NA, 67L, 106L, 90L, 75L, 123L, 95L, 122L, 122L, 137L, 137L, 83L, 83L), Event = c(NA, NA, NA, "Delay", "Delay", "Delay", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Delay", NA, "Delay")), class = "data.frame", row.names = c(NA, -28L)) I am thinking of using something like this (it doesn't work): df<-df2 ggplot(data =df)+ geom_line(data=df[!is.na(df$Score),],aes(x = Day, y = Score, color=Subject),size=0.8)+ scale_colour_manual(breaks = c("ELA", "Math", "Art"), values=c(ELA="#cc0022",Math="#70ad47", Art="#fd9300"))+ {if (grepl("Delay", df$Event)) geom_vline(data=df[(!is.na(df$Event)&df$Event=="Delay"),], aes(xintercept=jitter(Day), linetype="Delay"), color="black", size=0.4)}+ {if (grepl("Sick", df$Event)) geom_vline(data=df[(!is.na(df$Event)&df$Event=="Sick"),], aes(xintercept=jitter(Day), linetype="Sick"), color="purple", size=0.4)}+ scale_linetype_manual(name = 'Event', values = c('Delay' = 1, 'Sick' = 1), guide = guide_legend(override.aes = list(colour = c("black", "purple")))) Code chunk 3: ggplot(data =df)+ geom_line(data=df[!is.na(df$Score),],aes(x = Day, y = Score, color=Subject),size=0.8)+ scale_colour_manual(breaks = c("ELA", "Math", "Art"), values=c(ELA="#cc0022",Math="#70ad47", Art="#fd9300"))+ geom_vline(data=df[(!is.na(df$Event)&df$Event=="Delay"),], aes(xintercept=jitter(Day),linetype="Delay"), color="black", size=0.4)+ # geom_vline(data=df[(!is.na(df$Event)&df$Event=="Sick"),], aes(xintercept=jitter(Day) ), color="purple", size=0.4)+ scale_linetype_manual(name = 'Event', values = c( "Delay" = 1, "Sick" = 1 ), guide = guide_legend(override.aes = list(colour = c("black", "purple"))))
By using an if to add the layers you are on the right track. Instead of putting the conditions inside the ggplot code, I personally prefer to set up the conditional layers outside of the ggplot code and, best of all, to put everything inside a function. Doing so, one option to achieve your desired result may look like so: EDIT Additionally, instead of using the hack via the linetype aes to get a separate legend, you could use the ggnewscale package to add a second color legend. One benefit is that we need no fiddling via override.aes and no additional conditioning to manage the different cases: library(ggplot2) plot_fun <- function(df) { is_delay <- !is.na(df$Event) & df$Event == "Delay" is_sick <- !is.na(df$Event) & df$Event == "Sick" layer_delay <- if (any(is_delay)) geom_vline(data = df[is_delay, ], aes(xintercept = jitter(Day), color = "Delay"), size = 0.4) layer_sick <- if (any(is_sick)) geom_vline(data = df[is_sick, ], aes(xintercept = jitter(Day), color = "Sick"), size = 0.4) ggplot(data = df) + geom_line(data = df[!is.na(df$Score), ], aes(x = Day, y = Score, color = Subject), size = 0.8) + scale_colour_manual( breaks = c("ELA", "Math", "Art"), values = c(ELA = "#cc0022", Math = "#70ad47", Art = "#fd9300"), ) + ggnewscale::new_scale_color() + layer_delay + layer_sick + scale_colour_manual( name = "Event", values = c(Delay = "black", Sick = "purple"), limits = force ) } plot_fun(df2)
Ggplot2 doesn't scale Y axis
I'm having a problem regarding scaling Y axis on the ggplot2. I have a dataset (dane_dlugie) consisted of dates (Data) and river flow observations at three different spots (Osielec, Jordanów and Skawica Dolna). My goal is to plot flows from Osielec regarding the proper date. My plot code looks like this: ggplot(dane_dlugie, aes(x=Data, y=Osielec, group=1)) + geom_line()+labs(x="Data", y="Flow") + ggtitle("Osielec")+ scale_x_datetime(date_breaks = "1 day", date_labels = "%d-%m") On the Y axis, I would like to have flow scale with the break of 1 m3/s. I've tried using the 'scale_y_discrete' however without any success. Could anyone help me with that? Please find below the reproducible example. dane_dlugie <- structure(list(Data = structure(list(sec = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), min = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 0L), mday = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), mon = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), year = c(110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L, 110L), wday = c(6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 0L), yday = c(120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 121L), isdst = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), zone = c("CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", "CEST", 
"CEST", "CEST", "CEST", "CEST", "CEST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon", "year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), Osielec = c("1.281", "1.294", "1.294", "1.281", "1.268", "1.281", "1.294", "1.333", "1.32", "1.32", "1.333", "1.307", "1.333", "1.346", "1.346", "1.359", "1.32", "1.32", "1.294", "1.5328", "2.0296", "2.1952", "2.7541", "4.1775", "4.5983"), Jordanów = structure(c(124L, 124L, 118L, 115L, 108L, 108L, 108L, 115L, 103L, 111L, 127L, 120L, 122L, 120L, 116L, 125L, 122L, 111L, 122L, 206L, 258L, 236L, 234L, 266L, 281L), .Label = c("", "0,1672", "0,1696", "0,172", "0,1744", "0,1768", "0,1792", "0,1816", "0,184", "0,1864", "0,1888", "0,1912", "0,1936", "0,196", "0,1984", "0,2008", "0,2032", "0,2056", "0,208", "0,2104", "0,2128", "0,2152", "0,2176", "0,22", "0,2224", "0,2248", "0,2272", "0,2296", "0,232", "0,2344", "0,2368", "0,2392", "0,2416", "0,244", "0,2464", "0,2488", "0,2512", "0,2536", "0,256", "0,2584", "0,2608", "0,2632", "0,2656", "0,268", "0,2704", "0,2728", "0,2752", "0,2776", "0,28", "0,2824", "0,2848", "0,2872", "0,2896", "0,2944", "0,2968", "0,2992", "0,3016", "0,304", "0,3064", "0,3088", "0,3112", "0,3136", "0,316", "0,3184", "0,3208", "0,3232", "0,3256", "0,3264", "0,328", "0,3304", "0,3328", "0,333", "0,3352", "0,3354", "0,3362", "0,3376", "0,34", "0,3424", "0,3448", "0,3472", "0,3496", "0,3497", "0,352", "0,3544", "0,3568", "0,3588", "0,3592", "0,3616", "0,3623", "0,364", "0,3664", "0,3688", "0,3712", "0,3736", "0,376", "0,3784", "0,3808", "0,3832", "0,3856", "0,388", "0,3904", "0,3928", "0,394", "0,3952", "0,3976", "0,4", "0,4042", "0,4076", "0,4084", 
"0,4126", "0,4144", "0,4168", "0,4178", "0,421", "0,4212", "0,4246", "0,4252", "0,428", "0,4294", "0,4314", "0,4336", "0,4348", "0,4378", "0,4382", "0,4416", "0,442", "0,445", "0,4462", "0,4484", "0,4504", "0,4518", "0,4546", "0,4552", "0,4588", "0,463", "0,4672", "0,4714", "0,4756", "0,4798", "0,484", "0,4882", "0,4924", "0,4966", "0,5008", "0,505", "0,5092", "0,5134", "0,5176", "0,5218", "0,526", "0,5302", "0,5344", "0,5386", "0,5428", "0,547", "0,5512", "0,5554", "0,5596", "0,5638", "0,568", "0,5722", "0,5764", "0,5806", "0,5848", "0,589", "0,5932", "0,5974", "0,6016", "0,6058", "0,61", "0,6116", "0,6142", "0,6184", "0,6226", "0,6268", "0,631", "0,6352", "0,6394", "0,6436", "0,6478", "0,652", "0,6562", "0,6604", "0,6646", "0,6688", "0,673", "0,6772", "0,6814", "0,6856", "0,6898", "0,694", "0,6982", "0,7024", "0,7066", "0,7108", "0,715", "0,7192", "0,7234", "0,7276", "0,7318", "0,736", "0,7402", "0,7444", "0,7486", "0,7528", "0,7556", "0,757", "0,7584", "0,7612", "0,7654", "0,7696", "0,7738", "0,778", "0,7822", "0,7864", "0,7906", "0,7948", "0,799", "0,8032", "0,8074", "0,8116", "0,8158", "0,82", "0,8258", "0,8316", "0,8374", "0,8432", "0,849", "0,8548", "0,8606", "0,8664", "0,8722", "0,878", "0,8804", "0,8838", "0,8856", "0,8896", "0,8954", "0,9012", "0,907", "0,9128", "0,9186", "0,9244", "0,9302", "0,936", "0,9418", "0,9476", "0,9534", "0,9592", "0,965", "0,9708", "0,9766", "0,9824", "0,9882", "0,9916", "0,994", "0,9998", "1,0052", "1,0056", "1,0064", "1,0114", "1,0172", "1,023", "1,0288", "1,0346", "1,0364", "1,0404", "1,0462", "1,052", "1,0578", "1,0636", "1,0694", "1,0752", "1,081", "1,0868", "1,0926", "1,0984", "1,1042", "1,11", "1,1158", "1,1196", "1,1216", "1,1248", "1,1274", "1,1332", "1,139", "1,1448", "1,1506", "1,156", "1,1564", "1,1622", "1,1664", "1,168", "1,1738", "1,1796", "1,1854", "1,1912", "1,197", "1,2028", "1,2086", "1,2144", "1,2184", "1,2202", "1,2236", "1,226", "1,2288", "1,2318", "1,234", "1,2376", "1,2492", "1,2496", "1,2548", "1,255", 
"1,26", "1,2608", "1,2666", "1,2724", "1,2754", "1,2782", "1,284", "1,2898", "1,2908", "1,2956", "1,2985", "1,3014", "1,3072", "1,313", "1,3139", "1,3188", "1,3246", "1,3293", "1,3304", "1,3362", "1,337", "1,3415", "1,342", "1,3478", "1,3536", "1,3594", "1,3652", "1,3678", "1,371", "1,3755", "1,3768", "1,3826", "1,3884", "1,3942", "1,4", "1,408", "1,414", "1,416", "1,4217", "1,424", "1,432", "1,4371", "1,44", "1,4448", "1,448", "1,4525", "1,4602", "1,464", "1,4679", "1,472", "1,4756", "1,48", "1,4833", "1,488", "1,491", "1,496", "1,4987", "1,5064", "1,512", "1,5141", "1,52", "1,528", "1,536", "1,5372", "1,544", "1,5449", "1,552", "1,5526", "1,56", "1,5603", "1,568", "1,5757", "1,576", "1,5834", "1,584", "1,5911", "1,592", "1,6065", "1,608", "1,616", "1,6219", "1,624", "1,632", "1,64", "1,648", "1,6527", "1,664", "1,672", "1,68", "1,6835", "1,688", "1,6912", "1,696", "1,6989", "1,704", "1,7066", "1,712", "1,7143", "1,7297", "1,736", "1,744", "1,7451", "1,752", "1,7528", "1,76", "1,768", "1,776", "1,7836", "1,784", "1,792", "1,799", "1,8", "1,8067", "1,816", "1,8221", "1,824", "1,8298", "1,832", "1,8375", "1,84", "1,8452", "1,848", "1,8529", "1,856", "1,864", "1,872", "1,876", "1,888", "1,8914", "1,896", "1,8991", "1,904", "1,9068", "1,912", "1,92", "1,9222", "1,928", "1,936", "1,9376", "1,944", "1,9453", "1,952", "1,953", "1,96", "1,968", "1,9684", "1,976", "1,984", "1,9915", "1,992", "10,0136", "10,014", "10,035", "10,098", "10,14", "10,203", "10,371", "10,434", "10,455", "10,518", "10,539", "10,56", "10,833", "10,854", "10,875", "10,938", "10,959", "11,064", "11,106", "11,169", "11,184", "11,211", "11,232", "11,274", "11,337", "11,358", "11,444", "11,576", "11,664", "11,686", "11,862", "11,884", "12,104", "12,214", "12,236", "12,368", "12,434", "12,456", "12,72", "12,786", "12,918", "12,94", "13,028", "13,05", "13,072", "13,094", "13,182", "13,27", "13,314", "13,325", "13,424", "13,446", "13,578", "13,715", "13,991", "14,014", "14,037", "14,083", "14,152", 
"14,29", "14,451", "14,497", "14,635", "14,727", "14,819", "14,957", "15,003", "15,164", "15,21", "15,225", "15,375", "15,417", "15,44", "15,601", "15,67", "15,805", "15,808", "16,125", "16,15", "16,183", "16,291", "16,35", "16,399", "16,45", "16,65", "16,725", "16,925", "17,15", "17,225", "17,475", "17,775", "17,938", "18,1", "18,2", "18,325", "18,35", "18,562", "18,778", "18,886", "19,021", "19,048", "19,102", "19,237", "19,453", "19,588", "19,696", "19,885", "19,912", "2", "2,0069", "2,008", "2,0146", "2,016", "2,024", "2,032", "2,04", "2,0494", "2,056", "2,064", "2,0688", "2,072", "2,08", "2,088", "2,0882", "2,096", "2,104", "2,1076", "2,112", "2,12", "2,127", "2,128", "2,136", "2,1367", "2,144", "2,152", "2,16", "2,1658", "2,168", "2,176", "2,184", "2,1852", "2,192", "2,2", "2,21", "2,2143", "2,22", "2,224", "2,23", "2,2337", "2,24", "2,2434", "2,25", "2,2531", "2,26", "2,27", "2,2725", "2,28", "2,29", "2,3", "2,3016", "2,31", "2,3113", "2,32", "2,321", "2,33", "2,3404", "2,35", "2,3598", "2,36", "2,3695", "2,37", "2,3889", "2,39", "2,3986", "2,4", "2,41", "2,418", "2,4277", "2,43", "2,4374", "2,44", "2,4471", "2,45", "2,4568", "2,46", "2,47", "2,4762", "2,48", "2,4859", "2,49", "2,5053", "2,51", "2,515", "2,52", "2,5247", "2,54", "2,5441", "2,55", "2,56", "2,5635", "2,58", "2,5829", "2,59", "2,5926", "2,6", "2,61", "2,62", "2,63", "2,64", "2,6411", "2,65", "2,6508", "2,66", "2,67", "2,6799", "2,68", "2,69", "2,6993", "2,7", "2,71", "2,7187", "2,72", "2,7284", "2,73", "2,7381", "2,7478", "2,75", "2,7575", "2,76", "2,7672", "2,78", "2,7866", "2,79", "2,7963", "2,8", "2,81", "2,82", "2,8254", "2,83", "2,8351", "2,84", "2,8448", "2,85", "2,8545", "2,86", "2,8642", "2,87", "2,8739", "2,88", "2,89", "2,8933", "2,9", "2,91", "2,9127", "2,92", "2,9224", "2,93", "2,94", "2,9418", "2,95", "2,9515", "2,96", "2,97", "2,98", "2,9806", "2,99", "2,9903", "20,101", "20,155", "20,182", "20,217", "20,506", "20,695", "21,43", "21,52", "21,713", "21,94", "22,03", "22,09", 
"22,27", "22,63", "22,78", "23,015", "23,046", "23,14", "23,59", "23,65", "23,98", "24,298", "24,595", "24,727", "24,958", "25,189", "25,42", "26,113", "26,179", "26,641", "26,74", "26,872", "26,971", "27,616", "27,832", "28,336", "29,668", "29,956", "3", "3,01", "3,011", "3,02", "3,022", "3,03", "3,033", "3,04", "3,044", "3,05", "3,055", "3,06", "3,07", "3,077", "3,08", "3,088", "3,09", "3,099", "3,1", "3,11", "3,13", "3,14", "3,143", "3,15", "3,154", "3,16", "3,165", "3,17", "3,176", "3,18", "3,19", "3,198", "3,2", "3,209", "3,212", "3,224", "3,242", "3,248", "3,253", "3,26", "3,264", "3,272", "3,275", "3,284", "3,286", "3,296", "3,297", "3,308", "3,319", "3,32", "3,33", "3,332", "3,341", "3,344", "3,352", "3,363", "3,368", "3,38", "3,392", "3,396", "3,404", "3,407", "3,416", "3,428", "3,429", "3,44", "3,451", "3,452", "3,462", "3,473", "3,488", "3,5", "3,506", "3,512", "3,517", "3,524", "3,536", "3,539", "3,548", "3,56", "3,561", "3,572", "3,583", "3,594", "3,596", "3,605", "3,608", "3,616", "3,62", "3,632", "3,638", "3,649", "3,656", "3,668", "3,68", "3,682", "3,692", "3,693", "3,704", "3,715", "3,716", "3,728", "3,74", "3,752", "3,759", "3,764", "3,776", "3,788", "3,792", "3,8", "3,812", "3,814", "3,824", "3,825", "3,836", "3,847", "3,86", "3,872", "3,884", "3,891", "3,896", "3,908", "3,932", "3,956", "3,968", "3,98", "3,99", "3,992", "30,028", "30,532", "30,676", "30,892", "31,533", "31,798", "31,943", "32,189", "32,763", "32,927", "33,009", "33,173", "33,296", "33,46", "33,583", "33,73", "33,788", "34,948", "35,241", "35,57", "36,181", "36,369", "37,027", "37,121", "37,5", "38,202", "38,343", "38,437", "38,484", "38,531", "39,001", "39,236", "39,659", "39,753", "4,001", "4,004", "4,028", "4,04", "4,045", "4,056", "4,064", "4,076", "4,088", "4,089", "4,1", "4,112", "4,121", "4,124", "4,136", "4,139", "4,16", "4,172", "4,178", "4,191", "4,204", "4,22", "4,232", "4,243", "4,256", "4,268", "4,28", "4,282", "4,304", "4,308", "4,316", "4,321", "4,328", "4,34", 
"4,352", "4,364", "4,388", "4,4", "4,425", "4,428", "4,442", "4,456", "4,484", "4,498", "4,526", "4,529", "4,54", "4,554", "4,582", "4,594", "4,596", "4,607", "4,624", "4,638", "4,652", "4,666", "4,685", "4,694", "4,708", "4,722", "4,736", "4,75", "4,764", "4,778", "4,792", "4,806", "4,82", "4,834", "4,841", "4,848", "4,876", "4,904", "4,918", "4,932", "4,945", "4,958", "4,96", "4,974", "4,984", "40,06", "40,164", "40,216", "40,528", "40,944", "41,048", "41,152", "41,412", "41,568", "41,88", "41,932", "42,4", "42,816", "45,29", "45,812", "46,16", "46,682", "47,088", "47,668", "47,842", "48,132", "49,872", "49,93", "5,016", "5,023", "5,03", "5,044", "5,058", "5,072", "5,075", "5,1", "5,114", "5,128", "5,142", "5,153", "5,184", "5,212", "5,218", "5,226", "5,24", "5,244", "5,254", "5,268", "5,282", "5,296", "5,324", "5,338", "5,352", "5,366", "5,374", "5,38", "5,394", "5,422", "5,436", "5,45", "5,475", "5,478", "5,492", "5,506", "5,52", "5,534", "5,562", "5,576", "5,59", "5,604", "5,618", "5,632", "5,646", "5,674", "5,685", "5,716", "5,73", "5,744", "5,8", "5,832", "5,848", "5,864", "5,88", "5,91", "5,912", "5,925", "5,928", "5,944", "5,96", "5,976", "50,8", "52,226", "52,474", "53,032", "53,962", "54,024", "54,458", "54,582", "55,078", "55,45", "55,636", "56,194", "57,068", "58,564", "59,176", "6,008", "6,024", "6,04", "6,045", "6,056", "6,072", "6,088", "6,104", "6,12", "6,136", "6,152", "6,232", "6,248", "6,264", "6,285", "6,296", "6,312", "6,328", "6,33", "6,345", "6,36", "6,375", "6,376", "6,408", "6,44", "6,45", "6,456", "6,472", "6,48", "6,488", "6,495", "6,504", "6,51", "6,52", "6,54", "6,552", "6,555", "6,584", "6,6", "6,632", "6,645", "6,648", "6,66", "6,664", "6,68", "6,705", "6,735", "6,744", "6,76", "6,776", "6,808", "6,824", "6,856", "6,872", "6,888", "6,904", "6,92", "6,936", "6,952", "6,968", "6,984", "60,808", "62,372", "63,664", "64,448", "65,024", "66,968", "68,336", "68,696", "69,632", "7,016", "7,08", "7,096", "7,112", "7,128", "7,144", "7,176", 
"7,192", "7,208", "7,24", "7,288", "7,304", "7,32", "7,336", "7,352", "7,368", "7,384", "7,4", "7,419", "7,438", "7,476", "7,533", "7,552", "7,571", "7,59", "7,609", "7,628", "7,647", "7,666", "7,685", "7,704", "7,723", "7,742", "7,78", "7,818", "7,837", "7,856", "7,913", "7,932", "7,97", "7,989", "70,712", "71,539", "74,388", "74,542", "76,929", "78,623", "8,008", "8,027", "8,084", "8,103", "8,122", "8,198", "8,217", "8,236", "8,2588", "8,274", "8,293", "8,331", "8,388", "8,407", "8,426", "8,464", "8,502", "8,54", "8,635", "8,673", "8,692", "8,711", "8,768", "8,787", "8,806", "8,882", "8,901", "8,92", "8,939", "8,958", "8,996", "80,212", "83,74", "86,092", "88,664", "9,034", "9,072", "9,11", "9,129", "9,205", "9,321", "9,405", "9,447", "9,489", "9,51", "9,552", "9,741", "9,783", "9,867", "9,888", "9,93", "9,951", "90,32", "91,608", "92,528", "94,552"), class = "factor"), Skawica.Dolna..Skawica. = structure(c(44L, 35L, 35L, 35L, 35L, 35L, 44L, 58L, 71L, 71L, 71L, 71L, 189L, 174L, 174L, 166L, 71L, 71L, 161L, 166L, 166L, 166L, 182L, 258L, 258L), .Label = c("", "1,023", "1,045", "1,056", "1,067", "1,078", "1,089", "1,1", "1,118", "1,136", "1,154", "1,172", "1,19", "1,208", "1,226", "1,244", "1,262", "1,298", "1,316", "1,334", "1,352", "1,37", "1,388", "1,406", "1,424", "1,43", "1,442", "1,46", "1,478", "1,496", "1,514", "1,518", "1,532", "1,55", "1,56", "1,568", "1,586", "1,604", "1,606", "1,622", "1,64", "1,658", "1,676", "1,686", "1,694", "1,712", "1,73", "1,7359", "1,748", "1,7644", "1,766", "1,7732", "1,782", "1,784", "1,7908", "1,7996", "1,802", "1,812", "1,8172", "1,82", "1,826", "1,838", "1,8436", "1,856", "1,8612", "1,87", "1,874", "1,892", "1,91", "1,928", "1,938", "1,946", "1,964", "1,982", "10", "10,014", "10,068", "10,08", "10,245", "10,32", "10,41", "10,44", "10,64", "10,641", "10,74", "10,812", "10,96", "101,5", "107", "11,07", "11,184", "11,268", "11,28", "11,4", "11,556", "11,6", "11,76", "11,92", "11,928", "110,3", "112,5", "116,9", "118", "12,12", 
"12,192", "12,228", "12,24", "12,3", "12,48", "12,56", "12,74", "12,84", "12,88", "122,8", "124", "13,2", "13,54", "13,56", "13,62", "13,88", "13,92", "130", "14,06", "14,22", "14,28", "14,5", "14,56", "14,64", "14,9", "14,94", "15", "15,24", "15,39", "15,58", "15,78", "15,82", "15,92", "16,17", "16,26", "16,56", "16,7", "16,95", "16,96", "17,34", "17,418", "17,68", "17,73", "18,12", "18,51", "18,66", "18,76", "18,9", "19,15", "19,32", "19,48", "19,74", "2", "2,029", "2,036", "2,058", "2,064", "2,087", "2,116", "2,145", "2,174", "2,19", "2,202", "2,203", "2,232", "2,261", "2,29", "2,348", "2,368", "2,376", "2,377", "2,406", "2,435", "2,464", "2,522", "2,534", "2,551", "2,562", "2,58", "2,609", "2,638", "2,696", "2,7", "2,725", "2,748", "2,783", "2,812", "2,87", "2,899", "2,928", "2,934", "2,957", "2,98", "2,986", "20,13", "20,16", "20,58", "21", "21,11", "21,42", "21,6", "21,84", "22,13", "22,26", "22,6", "22,66", "22,68", "23,1", "23,19", "23,56", "24,02", "24,2", "24,25", "24,48", "24,78", "24,94", "25,31", "25,354", "25,4", "25,84", "25,86", "25,96", "26,32", "26,78", "26,9", "27,24", "27,7", "28,67", "28,72", "28,9556", "29,09", "29,23", "29,58", "29,74", "29,85", "3,015", "3,044", "3,12", "3,131", "3,16", "3,189", "3,218", "3,26", "3,276", "3,334", "3,363", "3,392", "3,421", "3,45", "3,479", "3,508", "3,54", "3,595", "3,61", "3,624", "3,653", "3,74", "3,798", "3,82", "3,827", "3,856", "3,914", "3,972", "30,25", "30,76", "31,27", "31,78", "32,03", "32,29", "32,8", "33,01", "33,304", "33,36", "33,5", "33,92", "34,03", "34,48", "35,09", "35,6", "36", "36,68", "36,72", "37,28", "37,84", "38,4", "38,82", "39", "39,2", "39,6", "4,001", "4,03", "4,059", "4,1", "4,117", "4,146", "4,233", "4,291", "4,32", "4,345", "4,378", "4,39", "4,407", "4,494", "4,552", "4,59", "4,61", "4,639", "4,68", "4,697", "4,755", "4,813", "4,835", "4,871", "4,9", "4,97", "4,996", "40,2", "40,8", "41,4", "42", "42,5", "42,6", "43,2", "44,4", "45,04", "45,8", "45,808", "46,32", "46,96", 
"47,6", "48,24", "48,304", "48,88", "49,2", "49,52", "5,028", "5,06", "5,08", "5,124", "5,22", "5,26", "5,325", "5,348", "5,38", "5,444", "5,476", "5,54", "5,55", "5,57", "5,636", "5,796", "5,828", "5,84", "5,86", "5,871", "5,956", "50,16", "50,8", "52,6", "52,78", "53,308", "56,05", "56,278", "56,872", "57,4", "57,604", "58,896", "59,44", "59,5", "59,848", "59,984", "6,084", "6,13", "6,148", "6,172", "6,18", "6,308", "6,404", "6,42", "6,473", "6,5", "6,628", "6,66", "6,71", "6,774", "6,82", "6,884", "6,948", "6,98", "60,8", "61,344", "62,16", "62,976", "63,15", "63,52", "66,8", "67,12", "67,193", "67,558", "67,85", "68,58", "68,726", "69,456", "7", "7,075", "7,14", "7,204", "7,236", "7,3", "7,364", "7,376", "7,46", "7,492", "7,6", "7,677", "7,716", "7,78", "7,844", "7,9", "7,978", "70,04", "70,77", "71,208", "71,354", "71,4", "71,5", "72,54", "75,42", "76", "76,14", "76,3", "77,26", "79,5", "8,1", "8,166", "8,2", "8,232", "8,279", "8,364", "8,43", "8,5", "8,58", "8,628", "8,727", "8,76", "8,793", "8,8", "8,952", "80,35", "81", "82,05", "83,75", "86", "86,3", "88,9", "89,8", "9,09", "9,1", "9,189", "9,324", "9,4", "9,42", "9,486", "9,585", "9,696", "9,7", "9,75", "9,849", "90,25", "91", "96"), class = "factor")), .Names = c("Data", "Osielec", "Jordanów", "Skawica.Dolna..Skawica."), row.names = 26:50, class = "data.frame")
You need to convert your variable into numeric and you're more or less done. I use scale_y_continuous here. library(ggplot2) dane_dlugie$Osielec <- as.numeric(dane_dlugie$Osielec) ggplot(dane_dlugie, aes(x=Data, y=Osielec, group=1)) + geom_line()+labs(x="Data", y="Flow") + ggtitle("Osielec")+ scale_y_continuous(breaks = 1:5) + scale_x_datetime(date_breaks = "1 day", date_labels = "%d-%m")
Order axis when doing a bubble chart using plotly in R
I have a bubble chart using plotly in R but the order of the axis appear to be somehow odd. The output is as follows and you can see how the axis are not correct: The code that I'm using is as follows library(plotly) library(ggplot2) file <- c("C://link//data.csv") #dataSource <- read.csv(file, sep =",", header = TRUE) dataSource <- read.table(file, header=T, sep=",") dataSource <- na.omit(dataSource) slope <- 1 dataSource$size <- sqrt(dataSource$Y.1 * slope) colors <- c('#4AC6B7', '#1972A4') #, '#965F8A', '#FF7070', '#C61951') plot_ly(dataSource, x = ~Y.1.vs.Y.2, y = ~YTD.vs.Y.1.YTD, color = ~BU, size = ~size, colors = colors, type = 'scatter', mode = 'markers', sizes = c(min(dataSource$size), max(dataSource$size)), marker = list(symbol = 'circle', sizemode = 'diameter', line = list(width = 2, color = '#FFFFFF')), text = ~paste('Business Unit:', BU, '<br>Product:', Product, '<br>Y.1.vs.Y.2:', Y.1.vs.Y.2, '<br>YTD.vs.Y.1.YTD:', YTD.vs.Y.1.YTD)) %>% layout(title = 'Y.1.vs.Y.2 v. YTD.vs.Y.1.YTD', xaxis = list(title = 'Y.1.vs.Y.2', gridcolor = 'rgb(255, 255, 255)', zerolinewidth = 1, ticklen = 5, gridwidth = 2), yaxis = list(title = 'YTD.vs.Y.1.YTD', gridcolor = 'rgb(255, 255, 255)', zerolinewidth = 1, ticklen = 5, gridwith = 2), paper_bgcolor = 'rgb(243, 243, 243)', plot_bgcolor = 'rgb(243, 243, 243)') The data is as follows: structure(list(BU = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("B", "D"), class = "factor"), Product = structure(c(4L, 5L, 7L, 8L, 9L, 13L, 1L, 3L, 4L, 11L, 12L, 13L), .Label = c("ADT", "BHL", "CEX", "CMX", "CTL", "HTH", "MTL", "SSL", "TLS", "UTV", "WEX", "WLD", "WMX"), class = "factor"), Y.2 = c(4065L, 499L, 20L, 5491L, 781L, 53L, 34L, 1338L, 557L, 428L, 310L, 31L), Y.1 = c(4403L, 550L, 28L, 5225L, 871L, 46L, 22L, 1289L, 602L, 426L, 318L, 37L), Y.1.YTD = c(4403L, 550L, 28L, 5225L, 871L, 46L, 22L, 1289L, 602L, 426L, 318L, 37L), YTD = c(5026L, 503L, 29L, 3975L, 876L, 40L, 62L, 1395L, 717L, 423L, 277L, 35L), Y.1.vs.Y.2 
= structure(c(12L, 7L, 11L, 4L, 8L, 1L, 2L, 3L, 12L, 6L, 10L, 9L), .Label = c("-13%", "-35%", "-4%", "-5%", "-76%", "0%", "10%", "12%", "19%", "3%", "40%", "8%"), class = "factor"), YTD.vs.Y.1.YTD = structure(c(8L, 5L, 11L, 3L, 7L, 2L, 9L, 12L, 10L, 1L, 2L, 4L), .Label = c("-1%", "-13%", "-24%", "-5%", "-9%", "0%", "1%", "14%", "182%", "19%", "4%", "8%"), class = "factor")), .Names = c("BU", "Product", "Y.2", "Y.1", "Y.1.YTD", "YTD", "Y.1.vs.Y.2", "YTD.vs.Y.1.YTD"), row.names = c(2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 13L, 14L, 15L), class = "data.frame", na.action = structure(c(1L, 7L, 12L), .Names = c("1", "7", "12"), class = "omit")) Any ideas on how can I order the axis properly? Thanks
There are a few ways to manipulate factor levels, but things can get a bit messy if you're not careful. You should familiarize yourself with ?levels and ?factor, as well as maybe ?reorder and ?relevel. In the meantime, try something like this: dataSource[[7]] <- factor(dataSource[[7]], levels = c("-76%", "-35%", "-13%", "-5%", "-4%", "0%", "3%", "8%", "10%", "12%", "19%", "40%")) Edit: To consolidate my answer and comment... This behaviour is caused by the way factors are encoded. Your axis values are strings, and factor order is determined alphanumerically. So to change their order you have to specify it as above, or else code them numerically and give them the required names. There are many different ways to change them, in several packages. This answer provides a standard base R method for handling factors. For further info, start with the manual pages I suggested. As for it being "very manual": since factors are categorical (and therefore have a potentially arbitrary order), there is no way to automate their order unless you code them numerically in the desired order.
Thanks to the comments above I've been able to resolve the issue. Find below the full code, which I hope might help other users: library(plotly) library(ggplot2) file <- c("C://link//data.csv") dataSource <- read.table(file, header=T, sep=",") dataSource <- na.omit(dataSource) # Additional code to format the input values and recalculate the percentages BUValues = dataSource$BU ProductValues = dataSource$Product dataSource <-, stringsAsfactors = FALSE) dataSource$BU = BUValues dataSource$Product = ProductValues dataSource$Y.1.vs.Y.2 = round((dataSource$Y.1/dataSource$Y.2 -1)*100,2) dataSource$YTD.vs.Y.1.YTD = round((dataSource$YTD/dataSource$Y.1.YTD -1)*100,2) slope <- 1 dataSource$size <- sqrt(dataSource$Y.1 * slope) colors <- c('#4AC6B7', '#1972A4') #, '#965F8A', '#FF7070', '#C61951') plot_ly(dataSource, x = ~Y.1.vs.Y.2, y = ~YTD.vs.Y.1.YTD, color = ~BU, size = ~size, colors = colors, type = 'scatter', mode = 'markers', sizes = c(min(dataSource$size), max(dataSource$size)), marker = list(symbol = 'circle', sizemode = 'diameter', line = list(width = 2, color = '#FFFFFF')), text = ~paste('Business Unit:', BU, '<br>Product:', Product, '<br>YoY:',Y.1.vs.Y.2, '<br>YTD:',YTD.vs.Y.1.YTD)) %>% layout(title = 'YoY vs YTD Performance', xaxis = list(title = 'YoY Performance (%)', gridcolor = 'rgb(255, 255, 255)', zerolinewidth = 1, ticklen = 5, gridwidth = 2), yaxis = list(title = 'YTD Performance (%)', gridcolor = 'rgb(255, 255, 255)', zerolinewidth = 1, ticklen = 5, gridwith = 2), paper_bgcolor = 'rgb(243, 243, 243)', plot_bgcolor = 'rgb(243, 243, 243)')
NanoStringDiff produces very large logFC values
I am trying to use NanoStringDiff to identify differentially expressed microRNAs between treatment and control samples, but I am obtaining extraordinarily large values for the fold change (log2FC is 27 for certain genes, but expected to be around 1 by inspection). I think there is an error in my code. I am using the following guide. I've included my code below, and the data I am using is publicly available (GSE84971_non-normalized.txt.gz). I took this data and converted it into an Excel file, which I called "data", in the format specified in the guide. Any help is much appreciated. source("") biocLite() biocLite("NanoStringDiff") biocLite("Biobase") library("BioBase") library("NanoStringDiff") designs=data.frame(group=c("Control","Control","Control", "Treatment", "Treatment", "Treatment")) designs directory <- "/Users/admin/Desktop" path <- paste(directory, "data.csv", sep ="/", collapse = NULL) NanoStringData=createNanoStringSetFromCsv(path,header=TRUE,designs) pheno=pData(NanoStringData) group=pheno$group design.full=model.matrix(~0+group) #create a design (or model) matrix design.full contrast=c(-1,1) NanoStringData=estNormalizationFactors(NanoStringData) positiveFactor(NanoStringData) negativeFactor(NanoStringData) housekeepingFactor(NanoStringData) result=glm.LRT(NanoStringData,design.full, Beta= ncol(design.full), contrast=contrast) result$table I attempted to add the data using dput(NanoString); hopefully this makes my code more self-contained. The output is shown below. 
new("NanoStringSet" , positiveFactor = numeric(0) , negativeFactor = numeric(0) , housekeepingFactor = numeric(0) , positiveControl = structure(c(51117L, 9153L, 2357L, 749L, 133L, 88L, 30283L, 6423L, 1178L, 444L, 83L, 35L, 46143L, 8040L, 2014L, 554L, 114L, 82L, 48365L, 9338L, 2158L, 603L, 135L, 91L, 52744L, 10177L, 2391L, 786L, 143L, 72L, 70189L, 12069L, 3186L, 693L, 176L, 110L), .Dim = c(6L, 6L), .Dimnames = list(c("POS_A(128)", "POS_B(32)", "POS_C(8)", "POS_D(2)", "POS_E(0.5)", "POS_F(0.125)"), c("Control.1", "Control.2", "Control.3", "Treatment.1", "Treatment.2", "Treatment.3"))) , negativeControl = structure(c(52L, 32L, 40L, 14L, 104L, 74L, 28L, 25L, 29L, 11L, 74L, 45L, 31L, 32L, 29L, 13L, 80L, 60L, 61L, 44L, 32L, 5L, 103L, 74L, 56L, 42L, 44L, 15L, 135L, 62L, 55L, 54L, 36L, 12L, 108L, 61L), .Dim = c(6L, 6L), .Dimnames = list(c("NEG_B(0)", "NEG_C(0)", "NEG_A(0)", "NEG_F(0)", "NEG_E(0)", "NEG_D(0)"), c("Control.1", "Control.2", "Control.3", "Treatment.1", "Treatment.2", "Treatment.3" ))) , housekeepingControl = structure(c(825L, 1892L, 1293L, 1496L, 2157L, 1254L, 1081L, 1121L, 914L, 1223L, 2123L, 3912L, 1876L, 2217L, 3363L, 1392L, 1750L, 1626L, 1196L, 1917L, 1378L, 1446L, 1300L, 1077L, 1875L, 2098L, 8006L, 2989L, 4447L, 4930L), .Dim = 5:6, .Dimnames = list(c("Actb|0", "B2m|0", "Gapdh|0", "Rpl19|0", "Rplp0|0"), c("Control.1", "Control.2", "Control.3", "Treatment.1", "Treatment.2", "Treatment.3"))) , experimentData = new("MIAME" , name = "" , lab = "" , contact = "" , title = "" , abstract = "" , url = "" , pubMedIds = "" , samples = list() , hybridizations = list() , normControls = list() , preprocessing = list() , other = list() , .__classVersion__ = new("Versions" , .Data = list(c(1L, 0L, 0L), c(1L, 1L, 0L)) ) ) , assayData = <environment> , phenoData = new("AnnotatedDataFrame" , varMetadata = structure(list(labelDescription = NA_character_), .Names = "labelDescription", row.names = "group", class = "data.frame") , data = structure(list(group = structure(c(1L, 
1L, 1L, 2L, 2L, 2L), .Label = c("Control", "Treatment"), class = "factor")), .Names = "group", row.names = c("Control.1", "Control.2", "Control.3", "Treatment.1", "Treatment.2", "Treatment.3" ), class = "data.frame") , dimLabels = c("sampleNames", "sampleColumns") , .__classVersion__ = new("Versions" , .Data = list(c(1L, 1L, 0L)) ) ) , featureData = new("AnnotatedDataFrame" , varMetadata = structure(list(labelDescription = character(0)), .Names = "labelDescription", row.names = character(0), class = "data.frame") , data = structure(list(), .Names = character(0), class = "data.frame", row.names = c("hsa-miR-10a-5p|0", "hsa-miR-1234|0", "hsa-miR-185-5p|0", "hsa-miR-27a-3p", "hsa-miR-34c-3p", "hsa-miR-1181|0", "hsa-miR-601", "hsa-miR-4454")) , dimLabels = c("featureNames", "featureColumns") , .__classVersion__ = new("Versions" , .Data = list(c(1L, 1L, 0L)) ) ) , annotation = character(0) , protocolData = new("AnnotatedDataFrame" , varMetadata = structure(list(labelDescription = character(0)), .Names = "labelDescription", row.names = character(0), class = "data.frame") , data = structure(list(), .Names = character(0), class = "data.frame", row.names = c("Control.1", "Control.2", "Control.3", "Treatment.1", "Treatment.2", "Treatment.3" )) , dimLabels = c("sampleNames", "sampleColumns") , .__classVersion__ = new("Versions" , .Data = list(c(1L, 1L, 0L)) ) ) , .__classVersion__ = new("Versions" , .Data = list(c(3L, 3L, 1L), c(2L, 34L, 0L), c(1L, 3L, 0L), c(1L, 0L, 0L)) ) ) Thanks!
How to run a function against several dataframes and output dataframes with the same name as input in R
I have several dataframes that I am applying a function to The function works but I would like to lapply it to several dataframes and output the result according to the input names. Here is an example of one of the dataframes structure(list(chr = structure(c(1L, 1L, 1L), .Label = c("chr1", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr2", "chr20", "chr21", "chr22", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chrX", "chrY"), class = "factor"), leftPos = c(100260254L, 100735342L, 100805662L), strand.x = structure(c(1L, 1L, 2L), .Label = c("-", "+"), class = "factor"), X50CellJ_SLX.9395.FSeqJ.fq.gz = c(7L, 295L, 132L), Cytospongex10_SLX.9395.FSeqK.fq.gz = c(72L, 256L, 148L), FFPE20X_SLX.9395.fq.gz = c(5L, 74L, 36L), Tumour10_SMACCO_AH_088_SLX.9396.FSeqH.fq.gz = c(13L, 154L, 65L), Tumour11_SMACCO_SH_020_SLX.9396.FSeqI.fq.gz = c(1L, 0L, 0L), Tumour12_SMACCO_ED_008_SLX.9396.FSeqJ.fq.gz = c(3L, 25L, 8L), Tumour13_SMACCO_AH_086_SLX.9396.FSeqK.fq.gz = c(7L, 120L, 28L), Tumour1_SMACCO_AH_100_SLX.9396.FSeqA.fq.gz = c(0L, 0L, 0L), Tumour2_SMACCO_AH_058_SLX.9396.FSeqB.fq.gz = c(24L, 98L, 42L), Tumour3_SMACCO_SH_051_SLX.9396.FSeqC.fq.gz = c(29L, 92L, 29L), Tumour4_SMACCO_ED_031_SLX.9396.FSeqD.fq.gz = c(18L, 53L, 14L), Tumour5_SMACCO_RS_027_SLX.9396.FSeqE.fq.gz = c(8L, 93L, 17L), Tumour7_SMACCO_AH_026_SLX.9396.FSeqF.fq.gz = c(30L, 205L, 60L), Tumour9_SMACCO_ST_024_SLX.9396.FSeqG.fq.gz = c(15L, 129L, 17L), strand.y = structure(c(1L, 1L, 2L), .Label = c("-", "+"), class = "factor"), Tumour14_SMACCO_AH_094_SLX.9394.FSeqA.fq.gz = c(0L, 7L, 3L), Tumour15_SMACCO_WG_006_SLX.9394.FSeqB..fq.gz = c(3L, 19L, 4L), Tumour16_SMACCO_ST_035_SLX.9394.FSeqC.fq.gz = c(1L, 23L, 8L), Tumour17_SMACCO_ST_034_SLX.9394.fq.gz = c(7L, 26L, 5L), Control19_SLX.9394.FSeqE.fq.gz = c(51L, 256L, 36L), Control20_SLX.9394.FSeqF.fq.gz = c(23L, 110L, 34L), Control21_SLX.9394.FSeqG..fq.gz = c(30L, 56L, 11L), Control22_SLX.9394.FSeqH.fq.gz = c(22L, 72L, 
24L), Control23_SLX.9394.FSeqI.fq.gz = c(10L, 23L, 2L), Control25_SLX.9394.FSeqJ.fq.gz = c(17L, 72L, 8L), Control27_SLX.9394.FSeqK.fq.gz = c(10L, 21L, 9L), Control28_SLX.9395.FSeqA.fq.gz = c(13L, 40L, 4L), Control29_SLX.9395.FSeqB.fq.gz = c(14L, 39L, 6L), Control30_SLX.9395.FSeqC.fq.gz = c(5L, 32L, 5L), Control31_SLX.9395.FSeqD.fq.gz = c(7L, 11L, 5L), Control32_SLX.9395.FSeqE.fq.gz = c(5L, 32L, 4L), Control33_SLX.9395.FSeqF.fq.gz = c(10L, 25L, 6L), Control34_SLX.9395.FSeqG.fq.gz = c(3L, 32L, 1L), Control35_SLX.9395.FSeqH.fq.gz = c(10L, 33L, 0L), Controls = c(0L, 0L, 0L), Samples = c(0L, 0L, 0L)), .Names = c("chr", "leftPos", "strand.x", "X50CellJ_SLX.9395.FSeqJ.fq.gz", "Cytospongex10_SLX.9395.FSeqK.fq.gz", "FFPE20X_SLX.9395.fq.gz", "Tumour10_SMACCO_AH_088_SLX.9396.FSeqH.fq.gz", "Tumour11_SMACCO_SH_020_SLX.9396.FSeqI.fq.gz", "Tumour12_SMACCO_ED_008_SLX.9396.FSeqJ.fq.gz", "Tumour13_SMACCO_AH_086_SLX.9396.FSeqK.fq.gz", "Tumour1_SMACCO_AH_100_SLX.9396.FSeqA.fq.gz", "Tumour2_SMACCO_AH_058_SLX.9396.FSeqB.fq.gz", "Tumour3_SMACCO_SH_051_SLX.9396.FSeqC.fq.gz", "Tumour4_SMACCO_ED_031_SLX.9396.FSeqD.fq.gz", "Tumour5_SMACCO_RS_027_SLX.9396.FSeqE.fq.gz", "Tumour7_SMACCO_AH_026_SLX.9396.FSeqF.fq.gz", "Tumour9_SMACCO_ST_024_SLX.9396.FSeqG.fq.gz", "strand.y", "Tumour14_SMACCO_AH_094_SLX.9394.FSeqA.fq.gz", "Tumour15_SMACCO_WG_006_SLX.9394.FSeqB..fq.gz", "Tumour16_SMACCO_ST_035_SLX.9394.FSeqC.fq.gz", "Tumour17_SMACCO_ST_034_SLX.9394.fq.gz", "Control19_SLX.9394.FSeqE.fq.gz", "Control20_SLX.9394.FSeqF.fq.gz", "Control21_SLX.9394.FSeqG..fq.gz", "Control22_SLX.9394.FSeqH.fq.gz", "Control23_SLX.9394.FSeqI.fq.gz", "Control25_SLX.9394.FSeqJ.fq.gz", "Control27_SLX.9394.FSeqK.fq.gz", "Control28_SLX.9395.FSeqA.fq.gz", "Control29_SLX.9395.FSeqB.fq.gz", "Control30_SLX.9395.FSeqC.fq.gz", "Control31_SLX.9395.FSeqD.fq.gz", "Control32_SLX.9395.FSeqE.fq.gz", "Control33_SLX.9395.FSeqF.fq.gz", "Control34_SLX.9395.FSeqG.fq.gz", "Control35_SLX.9395.FSeqH.fq.gz", "Controls", "Samples"), row.names = c(NA, 
3L), class = "data.frame") Here is what I have so far: mylist <- list(A = OriginalMeta , B = SLX9392 , C = SLX9393, D = SLX9397, E = Gastric, F = Dysplasia, G = GoodDysplasia, H = Cholangio, I = LCM_PS14_1105_1F) sortIt <- function(df1) { df1$strand.x<- NULL df1$strand.y<- NULL df1$strand<-NULL df1$X.<-NULL names(df1)[1] <- c("chr") #Get rid of X and Y chromosomes df1 <- df1[!grepl("chrX", df1$chr), ] df1 <- df1[!grepl("chrY", df1$chr), ] xyAss3<-df1 return(xyAss3) } lapply(names(mylist), sortIt(x)write.csv(mylist[x], file =paste0(x,'.csv'))) The thing is, I just don't know how to feed mylist into the function. Should the x in the lapply call be passed to the function as df1? I'm a bit confused as to how to tie it all together.
I think you'll do better to fold the creation of the .csv into your function and then use a for loop to apply that function to each object in your list in turn. So something like this, where df is the sample data frame you posted: mylist <- list(A = df, B = df) sortIt <- function(i) { df = mylist[[i]] df[,"strand.x"] <- NULL df[,"strand.y"] <- NULL df[,"strand"] <- NULL df[,"X."] <- NULL names(df) <- c("chr", names(df)[2:length(names(df))]) df <- df[!grepl("chrX", df$chr), ] df <- df[!grepl("chrY", df$chr), ] write.csv(df, file = paste0(names(mylist)[i], ".csv"), row.names=FALSE) } for (i in seq(length(mylist))) {sortIt(i)} If you were trying to create a new object in your workspace, then one of the apply functions would be a better bet. But when you're trying to output files, I think you need to use a for loop instead.
Not really sure what you are trying to achieve, but guessing that you want to save the transformed data frame to a file with a name taken from the list, this could do the job (it should work with the rest of your code - note the [[1]]): lapply(names(mylist), function(x) write.csv(sortIt(mylist[x][[1]]), file = paste0(x,'.csv'))) Another option is to use mapply, here I'm attaching a complete example: # create the data dframes <- lapply(1:3, function(x) data.frame(x=rnorm(10), y=runif(10))) names(dframes) <- LETTERS[1:3] # the transformation function sortdf <- function(df) df[order(df$x),] # two variants of apply lapply(names(dframes), function(name) write.csv(sortdf(dframes[name][[1]]), file=paste0(name, '.csv'))) # mapply does not have the ugly [[1]] syntax bit, I'd prefer it myself mapply(function(name, df) write.csv(sortdf(df), file=paste0(name, '.csv')), names(dframes), dframes)