Related
I am trying to filter a large dataframe by ICD-10 codes.
I have the following dataframe as shown below.
prindx secdx1 secdx2 secdx3 secdx4 secdx5 secdx6 secdx7 secdx8 secdx9 secdx10 secdx11 secdx12 secdx13 secdx14 secdx15 secdx16 secdx17 secdx18 secdx19 secdx20 secdx21 secdx22 secdx23
1 S02118A R6520 J690 R403 A419 J151 J9621 E43 S32019A J15211 S22030A A047 T797XXA S22040A S32029A S32039A S32049A S32059A Y92413 X820XXA
9 S020XXB S066X9A S065X9A S064X9A S0102XA S82841A S62307A S52502A S52602A G9349 E872 D688 G910 J9601 T8119XA D62 G960 G9782 V5988XA Y92413 Y838
28 S14123A J9600 S069X9A J690 G8252 G9340 S0219XA S2232XA E872 E873 N179 E871 R1312 M62838 R410 S62521A B964 M4802 W130XXA Y9339 Y92038
44 S065X9A A4151 R6521 J690 J810 G40919 J90 L89152 E43 J155 J9600 T17590A E871 E872 I272 S066X9A S0219XA S061X9A Y9301 Y999 W108XXA
95 S0219XB J9600 J690 G9340 J95821 E871 R45851 D62 F1520 G960 G9782 R4701 E8351 G40909 S061X9A F0781 E876 K5900 X749XXA Y929 Y846
209 S12100A G9741 S062X0A N183 I129 R443 G4730 R410 Z96641 W109XXA Y838 Y92234
secdx24 secdx25 secdx26 secdx27 secdx28 secdx29 ID POISON
1 1 NULL
9 2 NULL
28 3 NULL
44 4 NULL
95 5 NULL
209 6 NULL
I am trying to select the rows that contain ICD codes starting with "S52". This is complicated further by the fact that I only need those codes whose 6th character is "1", "2", "3" or "4", for example S52602A. If both of these conditions are met, then I need the value of the "POISON" column to be "TRUE". Thanks!
Dput below
structure(list(prindx = c("S02118A", "S020XXB", "S14123A", "S065X9A",
"S0219XB", "S12100A", "S2243XA", "S32052A", "S0219XB", "S065X9A",
"S060X1A", "S060X9A", "S069X1A", "S065X0A", "S0210XA", "S72331A",
"S065X0A", "S0264XA", "S066X0A", "S066X0A"), secdx1 = c("R6520",
"S066X9A", "J9600", "A4151", "J9600", "G9741", "J9600", "S066X0A",
"I2699", "S066X9A", "R55", "F200", "S42292A", "I10", "J9601",
"S060X9A", "G935", "J690", "S12000A", "R4701"), secdx2 = c("J690",
"S065X9A", "S069X9A", "R6521", "J690", "S062X0A", "S270XXA",
"S37032A", "T794XXA", "J9601", "R112", "I10", "S42425A", "R531",
"S270XXA", "S42001A", "J9600", "G92", "E871", "I10"), secdx3 = c("R403",
"S064X9A", "J690", "J690", "G9340", "N183", "S069X9A", "S52572A",
"I82411", "S270XXA", "Y939", "S01112A", "V4959XA", "E669", "S066X5A",
"K5900", "R569", "Z6843", "E8342", "E780"), secdx4 = c("A419",
"S0102XA", "G8252", "J810", "J95821", "I129", "S25391A", "I252",
"E440", "J150", "W1830XA", "S022XXA", "Y92488", "Z6835", "G931",
"S8012XA", "D62", "E871", "B370", "R739"), secdx5 = c("J151",
"S82841A", "G9340", "G40919", "E871", "R443", "S3210XA", "E039",
"E872", "S22049A", "Y92011", "R40241", "", "W19XXXA", "R1312",
"S8011XA", "E8342", "D62", "F10129", "S065X0A"), secdx6 = c("J9621",
"S62307A", "S0219XA", "J90", "R45851", "G4730", "E872", "R0781",
"D62", "S02413A", "", "S0101XA", "", "", "S27321A", "R402352",
"E872", "J9601", "F1510", "R001"), secdx7 = c("E43", "S52502A",
"S2232XA", "L89152", "D62", "R410", "I959", "R0902", "J9602",
"S27322A", "", "H1130", "", "", "D62", "R402132", "I69354", "J9811",
"D509", "R079"), secdx8 = c("S32019A", "S52602A", "E872", "E43",
"F1520", "Z96641", "S27321A", "I4510", "J9601", "S22059A", "",
"E119", "", "", "G8191", "R402242", "F10239", "Z430", "F17210",
"E876"), secdx9 = c("J15211", "G9349", "E873", "J155", "G960",
"W109XXA", "S42322A", "E8809", "E871", "C9591", "", "Z23", "",
"", "S02401A", "V4362XA", "R1310", "S01411A", "S0219XA", "E8342"
), secdx10 = c("S22030A", "E872", "N179", "J9600", "G9782", "Y838",
"S0451XA", "F17210", "I824Z2", "J9811", "", "F319", "", "", "S023XXA",
"Y92410", "I10", "S1181XA", "R739", "W1830XA"), secdx11 = c("A047",
"D688", "E871", "T17590A", "R4701", "Y92234", "D62", "R312",
"J9811", "S0219XA", "", "S8251XA", "", "", "S065X5A", "", "F17210",
"E669", "G8911", "Y9321"), secdx12 = c("T797XXA", "G910", "R1312",
"E871", "E8351", "", "S52615A", "Z955", "G8101", "S01512A", "",
"S8010XA", "", "", "S062X5A", "", "S062X0A", "Z713", "R001",
"Y9289"), secdx13 = c("S22040A", "J9601", "M62838", "E872", "G40909",
"", "S66022A", "K219", "I82611", "H02209", "", "Z720", "", "",
"F0781", "", "E876", "L0889", "T50995A", ""), secdx14 = c("S32029A",
"T8119XA", "R410", "I272", "S061X9A", "", "F19921", "R413", "J449",
"J9809", "", "Y00XXXA", "", "", "D6959", "", "W010XXA", "B9562",
"W108XXA", ""), secdx15 = c("S32039A", "D62", "S62521A", "S066X9A",
"F0781", "", "S52272A", "V892XXA", "R1310", "S020XXA", "", "Y9389",
"", "", "F1210", "", "Y92003", "T4275XA", "Y92009", ""), secdx16 = c("S32049A",
"G960", "B964", "S0219XA", "E876", "", "S022XXA", "", "Z781",
"H1131", "", "Y9289", "", "", "R471", "", "", "S069X0A", "Y908",
""), secdx17 = c("S32059A", "G9782", "M4802", "S061X9A", "K5900",
"", "S42021A", "", "S062X6A", "R338", "", "", "", "", "H532",
"", "", "I10", "", ""), secdx18 = c("Y92413", "V5988XA", "W130XXA",
"Y9301", "X749XXA", "", "V4362XA", "", "X72XXXA", "W1809XA",
"", "", "", "", "V4988XA", "", "", "E8351", "", ""), secdx19 = c("X820XXA",
"Y92413", "Y9339", "Y999", "Y929", "", "Y939", "", "Y92009",
"Y990", "", "", "", "", "Y92411", "", "", "R739", "", ""), secdx20 = c("",
"Y838", "Y92038", "W108XXA", "Y846", "", "Y92411", "", "", "",
"", "", "", "", "Y9389", "", "", "D638", "", ""), secdx21 = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"G4733", "", ""), secdx22 = c("", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "G8911", "", ""), secdx23 = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"F1510", "", ""), secdx24 = c("", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "V535XXA", "", ""), secdx25 = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"Y92413", "", ""), secdx26 = c("", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "Y92230", "", ""), secdx27 = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", ""), secdx28 = c("", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", ""), secdx29 = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", ""), ID = 1:20, POISON = c("NULL", "NULL", "NULL", "NULL",
"NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL",
"NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL"
)), row.names = c(1L, 9L, 28L, 44L, 95L, 209L, 286L, 363L, 506L,
608L, 628L, 699L, 760L, 778L, 905L, 909L, 1018L, 1027L, 1126L,
1315L), class = "data.frame")
If I'm understanding correctly, this looks in every column (everything()) and returns TRUE if any column (if_any()) matches the regular expression.
^S52..[1234] means that the value starts (^) with S52, then has two characters that can be anything (. and .), which is a total of 5 characters. Then it looks in the 6th character for any of 1, 2, 3, or 4.
library(dplyr)
library(stringr)
# Set POISON to TRUE for every row in which at least one column holds a value
# that starts with "S52" and has 1, 2, 3 or 4 as its 6th character.
mutate(
  your_df,
  POISON = if_any(
    everything(),
    function(code) str_detect(code, "^S52..[1234]")
  )
)
Results stored in the POISON column:
[1] FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
[12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Additional details on regex, based on a comment:
To modify a regex literal, like abc to include both "abc" and "def", you'd use (abc|def). So to include other prefixes, you could use ^(S52|S53|S54)..[01234]. This particular example is the same as ^S5[234]..[01234], and also ^S5[2-4]..[0-4]. Choose the flavor you find readable.
I am currently working on a data project of mentions within tweets. The data is in CSV format, with each row being an individual tweet, and the variables being "User", "mention_1", "mention_2"...."mention_12". For example, User "AAA" retweeted User BBB and User CCC in one tweet, didn't retweet anyone else in another, and then mentioned both CCC and BBB in individual tweets.
User
Mention 1
Mention 2
...
Mention 12
AAA
BBB
CCC
...
NA
AAA
Blank
AAA
CCC
AAA
BBB
BBB
AAA
BBB
etc
Mention 1-12 is the name of the user being mentioned in the original tweet. Some tweets have up to twelve mentions, and some don't have any.
I am attempting to transform this data into an adjacency matrix of the form:
User 1
User 2
...
User N
User 1
#of mentions
#of mentions
...
#of mentions
User 2
#of mentions
#of mentions
...
#of mentions
User N
#of mentions
#of mentions
...
#of mentions
The Y and X axes are the names of the users tweeting each other, the values of the matrix are the number of mentions between the two users, with the diagonal being the number of times a user mentioned themselves.
I am attempting to create this matrix for network analysis with ERGMs, but I can't figure out how to transform the data without manually counting and filling in the matrix. Given that I have over 6000 rows, manual entry is not viable.
Does anyone know how to transform this table into an adjacency matrix in either R or excel?
Thank you in advance.
EDIT #1
Output of dput(head(df,20))
structure(list(ï..Ref = 1:20, user = c("Ziad Aboultaif", "Ziad Aboultaif", "Ziad Aboultaif", "Ziad Aboultaif", "Ziad Aboultaif","Ziad Aboultaif", "Ziad Aboultaif", "Ziad Aboultaif","Ziad Aboultaif", "Ziad Aboultaif", "Ziad Aboultaif", "Ziad Aboultaif","Ziad Aboultaif", "Ziad Aboultaif", "Ziad Aboultaif","Warren Steinley", "Warren Steinley", "Warren Steinley","Warren Steinley", "Warren Steinley"), Mention.0 =c("Candice Bergen", "Candice Bergen", "Dan Albas", "Erin O'Toole","Kerry-Lynne Findlay", "pierrepoilievre", "pierrepoilievre", "pierrepoilievre", "", "", "", "", "", "", "", "Melissa Lantsman","Melissa Lantsman", "", "", ""), Mention.1 = c("", "", "","", "", "", "", "", "Ziad Aboultaif", "", "", "", "GarnettGenuis", "Ziad Aboultaif", "Ziad Aboultaif", "", "", "Candice Bergen","Candice Bergen", ""), Mention.2 = c("", "", "", "", "", "", "", "", "", "","", "", "Dr. Stephen Ellis", "", "", "", "", "", "", ""),Mention.3= c("", "", "", "", "", "", "", "", "", "", "", "","Ziad Aboultaif", "", "", "", "", "", "", ""), Mention.4 = c("", "", "","","", "", "", "", "", "", "", "", "", "", "", "", "", "", "",""),Mention.5 = c(NA,NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,NA, NA, NA, NA, NA,NA, NA, NA), Mention.6 = c("", "", "", "", "","", "", "", "", "", "", "", "", "", "", "", "", "", "", ""),Mention.7 = c("","", "", "", "", "", "", "", "", "", "", "", "","", "", "", "", "", "", ""), Mention.8 = c("", "", "", "", "", "","", "", "", "", "", "", "", "", "", "", "", "", "", ""), Mention.9= c("","", "", "", "", "", "", "", "", "", "", "", "", "","", "", "", "", "", ""), Mention.10 = c("", "", "", "", "", "","","", "", "", "", "", "", "", "", "", "", "", "", ""), Mention.11= c("", "", "", "", "", "", "", "", "", "", "", "", "", "","", "", "", "", "", ""), Mention.12 = c("", "", "", "", "", "","","", "", "", "", "", "", "", "", "", "", "", "", "")), row.names= c(NA, 20L), class = "data.frame")
Edit 2: Current script
library(readr)
library(tidyverse)
library(igraph)
library(ergm)
# Build a square adjacency matrix of mention counts: rows are the tweeting
# user, columns are the mentioned user, and each cell is the number of
# mentions.

# "UTF-8-BOM" strips the byte-order mark that otherwise corrupts the first
# column name (it showed up as "ï..Ref" in the dput() output).
df <- read.csv(
  "CPC Retweets.csv",
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8-BOM"
)
str(df)
# Plain copy of the data; dput() would also print every row to the console.
refs <- df

# Count number of tweets per user
df2 <- xtabs(~user, data = df)

# Create list of users in data frame
users <- unique(df$user)

# Create mention variable: stack every Mention.* column into one long
# "from -> to" edge list. unlist() walks the columns one after another, so
# repeating the whole user vector once per column keeps the pairs aligned.
mention_cols <- grep("^mention", names(df), value = TRUE, ignore.case = TRUE)
edges <- data.frame(
  from = rep(df$user, times = length(mention_cols)),
  to = as.character(unlist(df[mention_cols], use.names = FALSE)),
  stringsAsFactors = FALSE
)
# Blank and NA cells are padding, not mentions.
edges <- edges[!is.na(edges$to) & nzchar(edges$to), , drop = FALSE]
men <- edges$to
umen <- unique(men)
# head(umen)

# Create Adjacency Matrix
# Mentioned accounts that never tweeted themselves are still nodes, so the
# matrix is built over the union of both sets and stays square.
nodes <- union(users, umen)
counts <- table(
  factor(edges$from, levels = nodes),
  factor(edges$to, levels = nodes)
)
mat <- matrix(
  as.vector(counts),
  nrow = length(nodes),
  ncol = length(nodes),
  dimnames = list(nodes, nodes)
)

# Preview the top-left corner without failing when there are fewer than 6 nodes
preview <- seq_len(min(6L, length(nodes)))
mat[preview, preview, drop = FALSE]
Perhaps this:
# Reshape to long form (one row per User / mention cell), then cross-tabulate
# User against the mentioned name. Blank cells are tabulated as well, which is
# the unnamed column in the output.
xtabs(
  formula = ~ User + value,
  data = reshape2::melt(data = refs, id.vars = "User")
)
# value
# User AAA BBB CCC
# AAA 4 0 2 2
# BBB 3 1 0 0
Data
# Toy input: one row per tweet, the tweeting user plus up to two mentions
# ("" means no mention in that slot).
refs <- data.frame(
  User = c("AAA", "AAA", "AAA", "AAA", "BBB", "BBB"),
  Mention.1 = c("BBB", "", "CCC", "", "AAA", ""),
  Mention.2 = c("CCC", "", "", "BBB", "", ""),
  stringsAsFactors = FALSE
)
Found the solution after some diving into forums. Here is for anyone with a similar problem,
library(reshape2)
# Create the wide count table: one row per user, one column per mentioned
# account, each cell the number of (user, mention) rows in `df`.
# `fun.aggregate = length` makes the counting explicit instead of relying on
# dcast()'s fallback; `drop = FALSE` keeps combinations that never occur (as 0).
adj_mat <- dcast(
  data = df,
  formula = user ~ mentions,
  fun.aggregate = length,
  drop = FALSE
)
# Create the adjacency matrix from the count columns only. Converting the whole
# data frame would pull in the `user` id column and coerce every cell to
# character, so the ids go into the row names instead.
tweets <- as.matrix(adj_mat[, -1, drop = FALSE])
rownames(tweets) <- as.character(adj_mat$user)
I had to bring all the mentions into a single column called "mentions". I could not find a solution to the problem unless the data was dyadic, and so there was some work in excel to make this solution work.
I read in the table using the following code:
# Read the tab-separated table (first line holds the column names), keeping
# text columns as character vectors rather than factors.
Data <- read.table(
  file = "1mo.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
Some columns have fewer entries than others. The problem arises when I try to calculate overlaps and intersections: I get the blank value as a common value. So how can I exclude these blank values without removing the entire column or row?
I am using RVenn for making the Venn diagram
# Turn each column into its own set. Shorter columns are padded with "" (or NA
# if the file was read with na.strings = ""); that padding is dropped here so
# it is not reported as an element shared by the sets.
Dat <- lapply(Data, function(set) {
  set <- as.character(set)
  set[!is.na(set) & nzchar(set)]
})
Test <- Venn(Dat)
# Elements common to all sets
overlap(Test)
# Elements common to the selected sets only
overlap(Test, c(1, 2, 3, 4, 6, 7))
setmap(Test)
Below is the data:
dput(Data)
structure(list(W = c("rno-let-7d-3p", "rno-let-7g-5p", "rno-miR-10b-5p",
"rno-miR-125b-5p", "rno-miR-127-3p", "rno-miR-133a-3p", "rno-miR-192-5p",
"rno-miR-196b-5p", "rno-miR-223-3p", "rno-miR-22-5p", "rno-miR-26b-5p",
"rno-miR-29c-3p", "rno-miR-29c-5p", "rno-miR-30c-5p", "rno-miR-30d-5p",
"rno-miR-30e-5p", "rno-miR-322-5p", "rno-miR-330-3p", "rno-miR-340-3p",
"rno-miR-3559-5p", "rno-miR-378a-3p", "rno-miR-378b", "rno-miR-383-5p",
"rno-miR-429", "rno-miR-451-3p", "rno-miR-499-5p", "rno-miR-542-5p",
"rno-miR-6328", "rno-miR-652-3p", "", "", "", "", "", "", ""),
Ni = c("rno-miR-125b-1-3p", "rno-miR-127-3p", "rno-miR-133a-3p",
"rno-miR-152-3p", "rno-miR-17-5p", "rno-miR-192-5p", "rno-miR-196b-5p",
"rno-miR-200a-3p", "rno-miR-20a-5p", "rno-miR-30d-5p", "rno-miR-322-5p",
"rno-miR-3559-5p", "rno-miR-499-5p", "rno-miR-505-3p", "rno-miR-542-5p",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", ""), Co = c("rno-miR-107-3p", "rno-miR-133a-3p",
"rno-miR-196b-5p", "rno-miR-203a-3p", "rno-miR-24-2-5p",
"rno-miR-26b-5p", "rno-miR-423-3p", "rno-miR-500-3p", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", ""), Fe = c("rno-miR-127-3p",
"rno-miR-133a-3p", "rno-miR-152-3p", "rno-miR-17-5p", "rno-miR-192-5p",
"rno-miR-196b-5p", "rno-miR-200a-3p", "rno-miR-30a-5p", "rno-miR-30c-5p",
"rno-miR-30d-5p", "rno-miR-30e-5p", "rno-miR-322-5p", "rno-miR-3559-5p",
"rno-miR-499-5p", "rno-miR-542-3p", "rno-miR-542-5p", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", ""), Cu = c("rno-miR-127-3p", "rno-miR-133a-3p",
"rno-miR-17-5p", "rno-miR-192-5p", "rno-miR-196b-5p", "rno-miR-1b",
"rno-miR-200a-3p", "rno-miR-200c-3p", "rno-miR-22-5p", "rno-miR-30a-5p",
"rno-miR-30d-5p", "rno-miR-320-3p", "rno-miR-322-5p", "rno-miR-3559-5p",
"rno-miR-499-5p", "rno-miR-500-3p", "rno-miR-542-3p", "rno-miR-542-5p",
"rno-miR-6216", "rno-miR-872-5p", "rno-miR-873-3p", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", ""), Al = c("rno-let-7a-1-3p",
"rno-let-7c-2-3p", "rno-miR-127-3p", "rno-miR-133a-3p", "rno-miR-17-5p",
"rno-miR-181d-3p", "rno-miR-191b", "rno-miR-192-5p", "rno-miR-196b-5p",
"rno-miR-200a-3p", "rno-miR-205", "rno-miR-29c-5p", "rno-miR-30d-5p",
"rno-miR-30e-5p", "rno-miR-322-5p", "rno-miR-3559-5p", "rno-miR-383-5p",
"rno-miR-434-3p", "rno-miR-499-5p", "rno-miR-542-3p", "rno-miR-542-5p",
"rno-miR-652-3p", "", "", "", "", "", "", "", "", "", "",
"", "", "", ""), Pb = c("rno-let-7a-1-3p", "rno-let-7c-2-3p",
"rno-let-7f-5p", "rno-miR-122-3p", "rno-miR-126a-3p", "rno-miR-127-3p",
"rno-miR-133a-3p", "rno-miR-144-3p", "rno-miR-148a-5p", "rno-miR-17-5p",
"rno-miR-192-5p", "rno-miR-195-5p", "rno-miR-196b-5p", "rno-miR-199a-3p",
"rno-miR-200a-3p", "rno-miR-200b-3p", "rno-miR-203b-3p",
"rno-miR-20a-5p", "rno-miR-29a-3p", "rno-miR-29b-3p", "rno-miR-29c-5p",
"rno-miR-30a-5p", "rno-miR-30d-5p", "rno-miR-30e-3p", "rno-miR-30e-5p",
"rno-miR-322-5p", "rno-miR-3559-5p", "rno-miR-383-5p", "rno-miR-423-5p",
"rno-miR-450b-3p", "rno-miR-484", "rno-miR-499-5p", "rno-miR-503-3p",
"rno-miR-542-3p", "rno-miR-542-5p", "rno-miR-99a-5p"), DU = c("rno-miR-122-3p",
"rno-miR-124-3p", "rno-miR-125a-5p", "rno-miR-125b-5p", "rno-miR-126a-3p",
"rno-miR-127-3p", "rno-miR-133a-3p", "rno-miR-184", "rno-miR-192-5p",
"rno-miR-196b-5p", "rno-miR-200a-3p", "rno-miR-200b-3p",
"rno-miR-27a-5p", "rno-miR-29b-3p", "rno-miR-29c-5p", "rno-miR-30a-5p",
"rno-miR-30d-5p", "rno-miR-30e-5p", "rno-miR-322-5p", "rno-miR-342-5p",
"rno-miR-3559-5p", "rno-miR-375-3p", "rno-miR-383-5p", "rno-miR-451-3p",
"rno-miR-484", "rno-miR-499-5p", "rno-miR-542-3p", "rno-miR-542-5p",
"", "", "", "", "", "", "", "")), class = "data.frame", row.names = c(NA,
-36L))
Very hard to tell from your example above, try something like this, I write something that's like your data:
# Small stand-in for the real table: six columns of miRNA names where the
# shorter columns are padded with "".
x <- data.frame(
  W = c("rno-miR-340-3p", "rno-miR-340-3p", "rno-miR-133a-3p"),
  Ni = c("rno-miR-133a-3p", "rno-miR-133a-3p", "rno-miR-500-3p"),
  Co = c("rno-miR-500-3p", "rno-miR-500-3p", "rno-miR-196b-5p"),
  Fe = c("rno-miR-196b-5p", "rno-miR-196b-5p", ""),
  Cu = c("rno-miR-133a-3p", "rno-miR-133a-3p", ""),
  Al = c("rno-let-7c-2-3p", "", ""),
  stringsAsFactors = FALSE
)
# Save it tab-separated, unquoted and without row names.
write.table(x, file = "test.txt", quote = FALSE, sep = "\t", row.names = FALSE)
And I read it in, similar to what you have:
# Read the tab-delimited file back with otherwise default settings and show it.
Data <- read.table(file = "test.txt", header = TRUE, sep = "\t")
Data
W Ni Co Fe
1 rno-miR-340-3p rno-miR-133a-3p rno-miR-500-3p rno-miR-196b-5p
2 rno-miR-340-3p rno-miR-133a-3p rno-miR-500-3p rno-miR-196b-5p
3 rno-miR-133a-3p rno-miR-500-3p rno-miR-196b-5p
Cu Al
1 rno-miR-133a-3p rno-let-7c-2-3p
2 rno-miR-133a-3p
3
One way is to fill in the blanks as NA:
# Same file, but short rows are padded (fill) and empty fields are read as NA
# instead of "".
Data <- read.table(
  file = "test.txt",
  header = TRUE,
  sep = "\t",
  fill = TRUE,
  na.strings = "",
  stringsAsFactors = FALSE
)
Data
W Ni Co Fe
1 rno-miR-340-3p rno-miR-133a-3p rno-miR-500-3p rno-miR-196b-5p
2 rno-miR-340-3p rno-miR-133a-3p rno-miR-500-3p rno-miR-196b-5p
3 rno-miR-133a-3p rno-miR-500-3p rno-miR-196b-5p <NA>
Cu Al
1 rno-miR-133a-3p rno-let-7c-2-3p
2 rno-miR-133a-3p <NA>
3 <NA> <NA>
Then if you wanna do the super venn diagram, you want to iterate through the columns, and omit the NAs:
library(RVenn)
# Drop the NAs from every column and plot the first three sets.
# lapply() always returns a list with one set per column; sapply() would
# simplify to a matrix whenever all columns keep the same length, and `[1:3]`
# would then select three single cells instead of three sets.
ggvenn(Venn(lapply(Data, na.omit)[1:3]))
I am testing new data provided from the UI, so I used a new.csv file that has the same column names as the data the model was trained on.
What is the problem with this?
I get an error on this line: classify_models(container2,maxent_model)
# Score new text with a previously trained maxent model: read new.csv, clean
# the combined Title/Body text, build a document-term matrix and classify it.

# Restores the objects saved in maxent.rda; `maxent_model` used at the end is
# presumably one of them (confirm).
load("maxent.rda")
new = read_csv("new.csv")
# One text field per row: Title and Body joined by a space.
new$data <- paste(new$Title,new$Body)
new$data = as.character(new$data)
# NOTE(review): repeats the previous line; the second conversion has no effect.
new$data = as.character(new$data)
# Overwrite the text of the LAST row with the caption supplied through `input`
# (presumably a Shiny input object; confirm).
new$data[nrow(new)] = as.character(input$caption)
# --- text cleaning; each step works on the output of the previous one ---
new$data = toupper(new$data)
# Strip anything that looks like an HTML/XML tag.
new$data = gsub("<.*?>", "", as.character(new$data))
# Remove ampersands.
new$data = gsub("&", "", new$data)
# Remove retweet markers followed by hashtags.
# NOTE(review): the text was upper-cased above, so the lowercase "via"
# alternative can no longer match.
new$data = gsub("(RT|via)((?:\\b\\W*#\\w+)+)", "", new$data)
# Remove hashtags.
new$data = gsub("#\\w+", "", new$data)
# Remove punctuation, then digits.
new$data = gsub("[[:punct:]]", "", new$data)
new$data = gsub("[[:digit:]]", "", new$data)
# Remove tokens starting with "http" (punctuation is already gone by now).
new$data = gsub("http\\w+", "", new$data)
# Runs of two or more spaces/tabs are deleted outright (not collapsed to a
# single space), which joins the neighbouring words.
new$data = gsub("[ \t]{2,}", "", new$data)
# Trim leading and trailing whitespace.
new$data = gsub("^\\s+|\\s+$", "", new$data)
#write.csv(new, "new_data1.csv", row.names = FALSE)
# Document-term matrix of the cleaned text, TF-IDF weighted.
# NOTE(review): this matrix is built from the new data alone, so its terms
# presumably differ from the vocabulary the model was trained on, which is a
# likely cause of the classify_models() error. create_matrix() has an
# `originalMatrix` argument for this case; confirm against the training script.
matrix2 <- create_matrix(new["data"], language="english",
weighting=tm::weightTfIdf)
# Every row is test data (no training rows); virgin=TRUE marks it as unlabeled.
container2 <- create_container(matrix2, new$TagId, trainSize=NULL,
testSize=1:nrow(new), virgin=TRUE)
maxent_results2 <- classify_models(container2,maxent_model)
# Input rows side by side with the classification results.
mydata = data.frame(new,maxent_results2)
I am trying to get the results of that form with httr.
Having looked the form results, I tried the following:
library(httr)
library(stringr)
# Address of the search form, assembled from two pieces to keep the lines short.
r <- str_c(
  "http://www.memoiredeshommes.sga.defense.gouv.fr/fr/arkotheque/",
  "client/mdh/base_morts_pour_la_france_premiere_guerre/index.php"
)
# Form fields for the search request. Only the surname prefix ("mo"), the
# birth year lower bound (1890) and the *_like flags carry values; every other
# field is sent empty.
q <- list(
  action = 1,
  todo = "rechercher",
  le_id = "",
  multisite = "",
  r_c_nom = "mo",
  r_c_nom_like = 1,
  r_c_prenom = "",
  r_c_prenom_like = 1,
  r_c_naissance_jour_mois_annee_jj_debut = "",
  r_c_naissance_jour_mois_annee_mm_debut = "",
  r_c_naissance_jour_mois_annee_yyyy_debut = 1890,
  r_c_naissance_jour_mois_annee_jj_fin = "",
  r_c_naissance_jour_mois_annee_mm_fin = "",
  r_c_naissance_jour_mois_annee_yyyy_fin = "",
  r_c_id_naissance_departement = "",
  hidden_c_id_naissance_departement = "",
  r_c_id_naissance_pays = "",
  hidden_c_id_naissance_pays = "",
  r_annot_c_id_grade = "",
  hidden_c_id_grade = "",
  r_annot_c_id_unite = "",
  hidden_c_id_unite = "",
  r_annot_c_id_recrutement_bureau = "",
  hidden_c_id_recrutement_bureau = "",
  r_annot_c_classe = "",
  r_annot_c_recrutement_matricule = "",
  r_annot_c_id_naissance_lieu = "",
  hidden_c_id_naissance_lieu = "",
  r_annot_c_deces_jour_mois_annee_jj_debut = "",
  r_annot_c_deces_jour_mois_annee_mm_debut = "",
  r_annot_c_deces_jour_mois_annee_yyyy_debut = "",
  r_annot_c_deces_jour_mois_annee_jj_fin = "",
  r_annot_c_deces_jour_mois_annee_mm_fin = "",
  r_annot_c_deces_jour_mois_annee_yyyy_fin = "",
  r_annot_c_id_deces_lieu = "",
  hidden_c_id_deces_lieu = "",
  r_annot_c_deces_lieu_complement = "",
  r_annot_c_deces_lieu_complement_like = 1,
  r_annot_c_id_deces_departement = "",
  hidden_c_id_deces_departement = "",
  r_annot_c_id_deces_pays = "",
  hidden_c_id_deces_pays = "",
  r_annot_c_id_transcription_etablissement_lieu = "",
  hidden_c_id_transcription_etablissement_lieu = "",
  r_annot_c_id_transcription_etablissement_departement = "",
  hidden_c_id_transcription_etablissement_departement = "",
  r_annot_c_id_transcription_etablissement_pays = "",
  hidden_c_id_transcription_etablissement_pays = ""
)
# Submit the search as a form-encoded POST. The fields are form inputs, and
# sending them as a GET query string does not return the result page.
t <- POST(r, body = q, encode = "form", verbose())
# Fail loudly on an HTTP error instead of writing an empty or NA page to disk.
stop_for_status(t)
writeLines(content(t, "text", encoding = "UTF-8"), "~/Desktop/test.html")
… which is not working at all (all I get is NA).
What am I doing wrong?
You could try it like this
library(rvest)
# Open a session on `url`, POST the form fields in `q` as a form-encoded body,
# then parse the returned page and extract its tables.
html_session(url) %>%
  rvest:::request_POST(url = url, body = q, encode = "form") %>%
  read_html() %>%
  html_table()
# [[1]]
# Nom Prénom(s) Date de naissance Département/Pays de naissance Détail Images Panier Lien Fiche annotée
# 1 MOAL Alain Marc 10-08-1890 29 - Finistère Détail Visualiser Panier Ark oui
# 2 MOAL Jean 22-12-1890 29 - Finistère Détail Visualiser Panier Ark oui
# 3 MOAL Joseph Marie 29-04-1890 29 - Finistère Détail Visualiser Panier Ark oui
# 4 MOALIC Pierre Joseph Marie 05-04-1890 29 - Finistère Détail Visualiser Panier Ark oui
# ...