Extract chart data from powerpoint slides - r

Given a powerpoint file with a chart containing chart data, how can I extract the chart data as a data frame? That is, given the the tempf.pptx file, how can I retrieve the iris dataset?
library(magrittr)
library(mschart)
library(officer)
linec <- ms_linechart(data = iris, x = "Sepal.Length",
y = "Sepal.Width", group = "Species")
linec <- chart_ax_y(linec, num_fmt = "0.00", rotation = -90)
doc <- read_pptx()
doc <- add_slide(doc, layout = "Title and Content", master = "Office Theme")
doc <- ph_with_chart(doc, chart = linec)
print(doc, target = tempf.pptx <- tempfile(fileext = ".pptx"))

Another approach would be to directly import the xls file associated with the chart :
tempdir <- tempfile()
officer::unpack_folder(tempf.pptx, tempdir)
xl_file <- list.files(tempdir, recursive = TRUE, full.names = TRUE, pattern = "\\.xlsx$")
readxl::read_excel(xl_file)
Note: this code only works because there is only one dataset in the pptx file. If there were more than a file, the relationships file *.xml.rels should be read to be sure we import the correct xlsx file (the xl reference is stored in ppt/charts/_rels/chart_file_title.xml.rels)

"Cut and paste" is a seriously flawed anti-pattern for reproducible code & analyses or automation (all things we strive for in data science workflows).
This is starter code that gets you to the data elements (but you still have some "roll up your sleeves" work to do
library(xml2)
library(magrittr)
# temp holding space for the unzipped PPTX
td <- tempfile("dir")
# unzip it and keep file names
fils <- unzip(tempf.pptx, exdir = td)
# look for chart XML files
charts <- fils[grepl("chart.*\\.xml$", fils)]
# read in the first one
chart <- read_xml(charts[1])
Now that we found and read in a chart XML file, let's see if we figure out which kind of chart it is:
# find charts in the XML (i don't know if there can be more than one per-XML file)
(embedded_charts <- xml_find_all(chart, ".//c:chart/c:plotArea"))
## {xml_nodeset (1)}
## [1] <c:plotArea xmlns:c="http://schemas.openxmlformats.org/drawingml/200 ...
# get the node root of the first one (again, i'm not sure if there can be more than one)
(first_embed <- embedded_charts[1])
## {xml_nodeset (1)}
## [1] <c:plotArea xmlns:c="http://schemas.openxmlformats.org/drawingml/200 ...
# use it to get the kind of chart so we can target the values with it
(xml_children(first_embed) %>%
xml_name() %>%
grep("Chart", ., value=TRUE) -> embed_kind)
## [1] "lineChart"
Now we can try to find the data series for that chart.
(target <- xml_find_first(first_embed, sprintf(".//c:%s", embed_kind)))
## {xml_nodeset (1)}
## [1] <c:lineChart>\n <c:grouping val="standard"/>\n <c:varyColors val=" ...
# extract "column" metadata
col_refs <- xml_find_all(target, ".//c:ser/c:tx/c:strRef")
(xml_find_all(col_refs, ".//c:f") %>%
sapply(xml_text) -> col_specs)
## [1] "sheet1!$B$1" "sheet1!$C$1" "sheet1!$D$1"
(xml_find_all(col_refs, ".//c:v") %>%
sapply(xml_text))
## [1] "setosa" "versicolor" "virginica"
Extract "X" metadata & data:
x_val_refs <- xml_find_all(target, ".//c:cat")
(lapply(x_val_refs, xml_find_all, ".//c:f") %>%
sapply(xml_text) -> x_val_specs)
## [1] "sheet1!$A$2:$A$36" "sheet1!$A$2:$A$36" "sheet1!$A$2:$A$36"
(lapply(x_val_refs, xml_find_all, ".//c:v") %>%
sapply(xml_double) -> x_vals)
## [,1] [,2] [,3]
## [1,] 4.3 4.3 4.3
## [2,] 4.4 4.4 4.4
## [3,] 4.5 4.5 4.5
## [4,] 4.6 4.6 4.6
## [5,] 4.7 4.7 4.7
## [6,] 4.8 4.8 4.8
## [7,] 4.9 4.9 4.9
## [8,] 5.0 5.0 5.0
## [9,] 5.1 5.1 5.1
## [10,] 5.2 5.2 5.2
## [11,] 5.3 5.3 5.3
## [12,] 5.4 5.4 5.4
## [13,] 5.5 5.5 5.5
## [14,] 5.6 5.6 5.6
## [15,] 5.7 5.7 5.7
## [16,] 5.8 5.8 5.8
## [17,] 5.9 5.9 5.9
## [18,] 6.0 6.0 6.0
## [19,] 6.1 6.1 6.1
## [20,] 6.2 6.2 6.2
## [21,] 6.3 6.3 6.3
## [22,] 6.4 6.4 6.4
## [23,] 6.5 6.5 6.5
## [24,] 6.6 6.6 6.6
## [25,] 6.7 6.7 6.7
## [26,] 6.8 6.8 6.8
## [27,] 6.9 6.9 6.9
## [28,] 7.0 7.0 7.0
## [29,] 7.1 7.1 7.1
## [30,] 7.2 7.2 7.2
## [31,] 7.3 7.3 7.3
## [32,] 7.4 7.4 7.4
## [33,] 7.6 7.6 7.6
## [34,] 7.7 7.7 7.7
## [35,] 7.9 7.9 7.9
Extract "Y" metadata and data:
y_val_refs <- xml_find_all(target, ".//c:val")
(lapply(y_val_refs, xml_find_all, ".//c:f") %>%
sapply(xml_text) -> y_val_specs)
## [1] "sheet1!$B$2:$B$36" "sheet1!$C$2:$C$36" "sheet1!$D$2:$D$36"
(lapply(y_val_refs, xml_find_all, ".//c:v") %>%
sapply(xml_double) -> y_vals)
## [[1]]
## [1] 3.0 3.2 2.3 3.2 3.2 3.0 3.6 3.3 3.8 4.1 3.7 3.4 3.5 3.8 4.0
##
## [[2]]
## [1] 2.4 2.3 2.5 2.7 3.0 2.6 2.7 2.8 2.6 3.2 3.4 3.0 2.9 2.3 2.9 2.8 3.0
## [18] 3.1 2.8 3.1 3.2
##
## [[3]]
## [1] 2.5 2.8 2.5 2.7 3.0 3.0 2.6 3.4 2.5 3.1 3.0 3.0 3.2 3.1 3.0 3.0 2.9
## [18] 2.8 3.0 3.0 3.8
# see if there are X & Y titles
title_nodes <- xml_find_all(first_embed, ".//c:title")
(lapply(title_nodes, xml_find_all, ".//a:t") %>%
sapply(xml_text) -> titles)
## [1] "Sepal.Length" "Sepal.Width"
Unlike the impetus behind my docxtractr package (for getting tables out of Word docs) I haven't seen much call for this particular need much so I'm not sure there will be a package for the above idiom in the near future.

I don't know of a way to get the data from within R, but you could open up the pptx file, right-click the chart, and select "Edit Data" to see the underlying data in table form. Could then copy and paste into an R data frame using the handy datapasta package.

Related

How to pass the filtered dataframe to a subsequent function?

I'm trying to pass a filtered dataframe onto a subsequent function.
Consider Iris dataframe. I filter out only on Versicolor species and then I want to use Sepal.Length and Sepal.Width column into a function that takes two vectors. I'm currently trying to implement DouglasPeuckerNbPoints, so I will use this as an example
iris %>%
filter(
(Species == "versicolor"))
I have tried:
library(kmlShape)
iris %>%
filter(
(Species == "versicolor")) %>%
DouglasPeuckerNbPoints(.$Sepal.Length,.$Sepal.Width,20)
But this is giving me the error "Error in xy.coords(x, y, setLab = FALSE) : 'x' and 'y' lengths differ".
Any help here?
The following works. We can put the function inside {}. This is called lambda expression as there are more than one dot. See https://magrittr.tidyverse.org/reference/pipe.html for more information.
library(tidyverse)
library(kmlShape)
iris %>%
filter(Species == "versicolor") %>%
{DouglasPeuckerNbPoints(trajx = .$Sepal.Length,
trajy = .$Sepal.Width, 20)}
# x y
# 1 7.0 3.2
# 2 4.9 2.4
# 3 6.6 2.9
# 4 5.2 2.7
# 5 5.0 2.0
# 6 5.9 3.0
# 7 6.0 2.2
# 8 5.6 2.9
# 9 6.7 3.1
# 10 5.6 3.0
# 11 6.2 2.2
# 12 5.9 3.2
# 13 6.7 3.0
# 14 5.5 2.4
# 15 5.4 3.0
# 16 6.7 3.1
# 17 6.3 2.3
# 18 5.6 3.0
# 19 5.0 2.3
# 20 5.7 2.8

How to obtain values (e.g. median) from a boxplot in r?

I’ve plotted a boxplot for PM2.5 levels per year.
Boxplot(PM2.5~year, data=subset(dat, hour==12), las=1)
How can I extract values such as the median from the boxplots?
The default boxplot function returns summaries invisibly, you just have to assign it to a variable:
res <- boxplot(Sepal.Length ~ Species, data=iris)
Within res there exists an element stats:
> res$stats
[,1] [,2] [,3]
[1,] 4.3 4.9 5.6
[2,] 4.8 5.6 6.2
[3,] 5.0 5.9 6.5
[4,] 5.2 6.3 6.9
[5,] 5.8 7.0 7.9
These are quartile summaries of the boxes. The median is the middle one, so:
> res$stats[3,]
[1] 5.0 5.9 6.5

R: Free hand selection of data points in scatter plots

I would like to know if there is any good way to allow me getting the id of the points from a scatter plot by drawing a free hand polygon in R?
I found scatterD3 and it looks nice, but I can't manage to output the lab to a variable in R.
Thank you.
Roc
Here's one way
library(iplots)
with(iris, iplot(Sepal.Width,Petal.Width))
Use SHIFT (xor) or SHIFT+ALT (and) to select points (red):
Then:
iris[iset.selected(), ]
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# 119 7.7 2.6 6.9 2.3 virginica
# 115 5.8 2.8 5.1 2.4 virginica
# 133 6.4 2.8 5.6 2.2 virginica
# 136 7.7 3.0 6.1 2.3 virginica
# 146 6.7 3.0 5.2 2.3 virginica
# 142 6.9 3.1 5.1 2.3 virginica
gives you the selected rows.
The package "gatepoints" available on CRAN will allow you to draw a gate returning your points of interest.
The explanation is quite clear for anyone who reads the question. The link simply links to a package that can be used as follows:
First plot your points
x <- data.frame(x=1:10, y=1:10)
plot(x, col = "red", pch = 16)
Then select your points after running the following commands:
selectedPoints <- fhs(x)
This will return:
selectedPoints
#> [1] "4" "5" "7"
#> attr(,"gate")
#> x y
#> 1 6.099191 8.274120
#> 2 8.129107 7.048649
#> 3 8.526881 5.859404
#> 4 5.700760 6.716428
#> 5 5.605314 5.953430
#> 6 6.866882 3.764390
#> 7 3.313575 3.344069
#> 8 2.417270 5.217868

How can I reshape this data?

I have some data to reshape in R but can not figure out how.
Here is the scenario:
I have data like this
a<- c("exam1", "exam2", "exam3","exam4")
date1<- c(8.2,4.3,6.7,3.9)
date2<- c(11.2,9.3,6.5,4.1)
date3<- c(8.2,9.1,4.3,4.4)
dr.df.a <- cbind(a,date1,date2,date3)
a date1 date2 date3
[1,] "exam1" "8.2" "11.2" "8.2"
[2,] "exam2" "4.3" "9.3" "9.1"
[3,] "exam3" "6.7" "6.5" "4.3"
[4,] "exam4" "3.9" "4.1" "4.4"
b<- c("exam1", "exam2", "exam3","exam4")
date1<- c(8.6,14.3,6.7,13.9)
date2<- c(11.2,8.3,16.5,14.1)
date3<- c(4.2,9.1,4.3,14.4)
dr.df.b <- cbind(b,date1,date2,date3)
b date1 date2 date3
[1,] "exam1" "8.6" "11.2" "4.2"
[2,] "exam2" "14.3" "8.3" "9.1"
[3,] "exam3" "6.7" "16.5" "4.3"
[4,] "exam4" "13.9" "14.1" "14.4"
mylist<–list(dr.df.a,dr.df.b)
The example is for reproducibly proposes. I get the data in this format (dr.df.a and dr.df.b) There are multiple data frames in list object.
Now I need to reshape it a way to get one single line and variable names like
exam1_date1, exam1_date2 , exam1_date3, exam2_date1,exam2_date2 ... and so on and essentially I would like to get data frame with rows of exam1_date1, exam1_date2 , exam1_date3, exam2_date1,exam2_date2 ... for every data frame in list object.
How I can reshape this data and which function should I use ?
Try this:
library(reshape2)
# convert the first row (the one defined by variable 'a' in post) into column names
dr.df.2 <- setNames(dr.df[-1,], dr.df[1, ])
m <- melt(dr.df.2)
d <- dcast(m, 1 ~ ...)[-1]
names(d) <- sub("_", "_exam", names(d)) # fix up names (optional)
Giving this:
> d
date1_exam1 date1_exam2 date1_exam3 date1_exam4 date2_exam1 date2_exam2
1 8.2 4.3 6.7 3.9 11.2 9.3
date2_exam3 date2_exam4 date3_exam1 date3_exam2 date3_exam3 date3_exam4
1 6.5 4.1 8.2 9.1 4.3 4.4
UPDATE: simplified dcast formula
If your dr.df object were a data.frame instead of a matrix, you can easily create a named vector as demonstrated below:
Create your data, but as a data.frame this time:
a <- c("exam1", "exam2", "exam3","exam4")
date1 <- c(8.2,4.3,6.7,3.9)
date2 <- c(11.2,9.3,6.5,4.1)
date3 <- c(8.2,9.1,4.3,4.4)
dr.df <- rbind(date1, date2, date3)
colnames(dr.df) <- a
dr.df <- as.data.frame(dr.df)
dr.df
# exam1 exam2 exam3 exam4
# date1 8.2 4.3 6.7 3.9
# date2 11.2 9.3 6.5 4.1
# date3 8.2 9.1 4.3 4.4
The "reshaping" step
You can now simply use stack to get the data in a long form.
dr.dfL <- data.frame(stack(dr.df), date = rownames(dr.df))
The values for the vector you want are in the "values" column, and the names for those values can be obtained using paste.
setNames(dr.dfL$values, paste(dr.dfL$ind, dr.dfL$date, sep = "_"))
# exam1_date1 exam1_date2 exam1_date3 exam2_date1 exam2_date2 exam2_date3
# 8.2 11.2 8.2 4.3 9.3 9.1
# exam3_date1 exam3_date2 exam3_date3 exam4_date1 exam4_date2 exam4_date3
# 6.7 6.5 4.3 3.9 4.1 4.4
Note that the result here is just a named vector, not a data.frame, as in the other answers.
You can use reshape from base R:
new <- reshape(dr, varying = list(c("date1","date2","date3")), direction = "long")
new$newname <- apply(new, 1, function(x) paste(x[1],paste("date",x[2],sep=""),sep="_"))
new <- new[,c("date1","newname")]
names(new) <- c("info","exam")
Outputs:
> new
info exam
1.1 8.2 exam1_date1
2.1 4.3 exam2_date1
3.1 6.7 exam3_date1
4.1 3.9 exam4_date1
1.2 11.2 exam1_date2
2.2 9.3 exam2_date2
3.2 6.5 exam3_date2
4.2 4.1 exam4_date2
1.3 8.2 exam1_date3
2.3 9.1 exam2_date3
3.3 4.3 exam3_date3
4.3 4.4 exam4_date3

Multiple columns of data and getting average R program

I asked a question like this before but I decided to simplify my data format because I'm very new at R and didnt understand what was going on....here's the link for the question How to handle more than multiple sets of data in R programming?
But I edited what my data should look like and decided to leave it like this..in this format...
X1.0 X X2.0 X.1
0.9 0.9 0.2 1.2
1.3 1.4 0.8 1.4
As you can see I have four columns of data, The real data I'm dealing with is up to 2000 data points.....Columns "X1.0" and "X2.0" refer "Time"...so what I want is the average of "X" and "X.1" every 100 seconds based on my 2 columns of time which are "X1.0" and "X2.0"...I can do it using this command
cuts <- cut(data$X1.0, breaks=seq(0, max(data$X1.0)+400, 400))
 by(data$X, cuts, mean)
But this will only give me the average from one set of data....which is "X1.0" and "X".....How will I do it so that I could get averages from more than one data set....I also want to stop having this kind of output
cuts: (0,400]
[1] 0.7
------------------------------------------------------------
cuts: (400,800]
[1] 0.805
Note that the output was done every 400 s....I really want a list of those cuts which are the averages at different intervals...please help......I just used data=read.delim("clipboard") to get my data into the program
It is a little bit confusing what output do you want to get.
First I change colnames but this is optional
colnames(dat) <- c('t1','v1','t2','v2')
Then I will use ave which is like by but with better output. I am using a trick of a matrix to index column:
matrix(1:ncol(dat),ncol=2) ## column1 is col1 adn col2...
[,1] [,2]
[1,] 1 3
[2,] 2 4
Then I am using this matrix with apply. Here the entire solution:
cbind(dat,
apply(matrix(1:ncol(dat),ncol=2),2,
function(x,by=10){ ## by 10 seconds! you can replace this
## with 100 or 400 in you real data
t.col <- dat[,x][,1] ## txxx
v.col <- dat[,x][,2] ## vxxx
ave(v.col,cut(t.col,
breaks=seq(0, max(t.col),by)),
FUN=mean)})
)
EDIT correct the cut and simplify the code
cbind(dat,
apply(matrix(1:ncol(dat),ncol=2),2,
function(x,by=10)ave(dat[,x][,1], dat[,x][,1] %/% by)))
X1.0 X X2.0 X.1 1 2
1 0.9 0.9 0.2 1.2 3.3000 3.991667
2 1.3 1.4 0.8 1.4 3.3000 3.991667
3 2.0 1.7 1.6 1.1 3.3000 3.991667
4 2.6 1.9 2.2 1.6 3.3000 3.991667
5 9.7 1.0 2.8 1.3 3.3000 3.991667
6 10.7 0.8 3.5 1.1 12.8375 3.991667
7 11.6 1.5 4.1 1.8 12.8375 3.991667
8 12.1 1.4 4.7 1.2 12.8375 3.991667
9 12.6 1.8 5.4 1.2 12.8375 3.991667
10 13.2 2.1 6.3 1.3 12.8375 3.991667
11 13.7 1.6 6.9 1.1 12.8375 3.991667
12 14.2 2.2 9.4 1.3 12.8375 3.991667
13 14.6 1.8 10.0 1.5 12.8375 10.000000

Resources