I am trying to collect transcripts of conference calls from Seeking Alpha for a research project (I am a PhD student). I found code online that extracts the transcripts and stores them in a .json file, and I have already adjusted it to rotate user agents. However, the code only extracts the first page of each conference call transcript because of the following:
body = response.css('div#a-body p.p1')
chunks = body.css('p.p1')
The pages are represented by a series of <p> elements, with the classes .p1, .p2, .p3, etc. indicating the page numbers. I have already tried a number of things, such as replacing the above code with:
response.xpath('//div[@id="a-body"]/p')
but I have not been able to extract the full conference call transcript (only the first page). Below is the full code:
import scrapy
# This enum lists the stages of each transcript.
from enum import Enum
import random
# SRC: https://developers.whatismybrowser.com/useragents/explore/
user_agent_list = [
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/37.0.2062.94 Chrome/37.0.2062.94 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
#Firefox
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]
Stage = Enum('Stage', 'preamble execs analysts body')
# Some transcript preambles are concatenated on a single line. This list is used
# to separate the title and date sections of the string.
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
transcripts = {}
class TranscriptSpider(scrapy.Spider):
name = 'transcripts'
custom_settings = {
'DOWNLOAD_DELAY': 2 # 0.25 == 250 ms of delay, 1 == 1000ms of delay, etc.
}
start_urls = ['http://seekingalpha.com/earnings/earnings-call-transcripts/1']
def parse(self, response):
# Follows each transcript page's link from the given index page.
for href in response.css('.dashboard-article-link::attr(href)').extract():
user_agent = random.choice(user_agent_list)
yield scrapy.Request(response.urljoin(href), callback=self.parse_transcript,headers={'User-Agent': user_agent})
# Follows the pagination links at the bottom of given index page.
next_page = response.css('li.next a::attr(href)').extract_first()
if next_page is not None:
next_page = response.urljoin(next_page)
yield scrapy.Request(next_page, callback=self.parse)
def parse_transcript(self, response):
i = 0
transcript = {}
details = {}
execs = []
analysts = []
script = []
mode = 1
# As the pages are represented by a series of `<p>` elements we have to do this the
# old-fashioned way - breaking it into chunks and iterating over them.
body = response.css('div#a-body p.p1')
chunks = body.css('p.p1')
while i < len(chunks):
# A heading line outside the transcript body marks the start of the
# next section (handled in the else branch below); anything else is
# processed as part of the current section.
if (len(chunks[i].css('strong::text').extract()) == 0) or (mode == 4):
currStage = Stage(mode)
# If we're on the preamble stage, each bit of data is extracted
# separately as they all have their own key in the JSON.
if currStage == Stage['preamble']:
# If we're on the first line of the preamble, that's the
# company name, stock exchange and ticker acronym (or should
# be - see below)
if i == 0:
# Checks to see if the second line is a heading. If not,
# everything is fine.
if len(chunks[1].css('strong::text').extract()) == 0:
details['company'] = chunks[i].css('p::text').extract_first()
if " (" in details['company']:
details['company'] = details['company'].split(' (')[0]
# If a specific stock exchange is not listed, it
# defaults to NYSE
details['exchange'] = "NYSE"
details['ticker'] = chunks.css('a::text').extract_first()
if ":" in details['ticker']:
ticker = details['ticker'].split(':')
details['exchange'] = ticker[0]
details['ticker'] = ticker[1]
# However, if it is, that means this line contains the
# full, concatenated preamble, so everything must be
# extracted here
else:
details['company'] = chunks[i].css('p::text').extract_first()
if " (" in details['company']:
details['company'] = details['company'].split(' (')[0]
# if a specific stock exchange is not listed, default to NYSE
details['exchange'] = "NYSE"
details['ticker'] = chunks.css('a::text').extract_first()
if ":" in details['ticker']:
ticker = details['ticker'].split(':')
details['exchange'] = ticker[0]
details['ticker'] = ticker[1]
titleAndDate = chunks[i].css('p::text').extract()[1]
for date in months:
if date in titleAndDate:
splits = titleAndDate.split(date)
details['title'] = splits[0]
details['date'] = date + splits[1]
# Otherwise, we're onto the title line.
elif i == 1:
title = chunks[i].css('p::text').extract_first()
# This should never be the case, but just to be careful
# I'm leaving it in.
if len(title) <= 0:
title = "NO TITLE"
details['title'] = title
# Or the date line.
elif i == 2:
details['date'] = chunks[i].css('p::text').extract_first()
# If we're onto the 'Executives' section, we create a list of
# all of their names, positions and company name (from the
# preamble).
elif currStage == Stage['execs']:
anExec = chunks[i].css('p::text').extract_first().split(" - ")
# This covers the case where the execs are separated with an
# en dash (–) rather than a plain hyphen.
if len(anExec) <= 1:
anExec = chunks[i].css('p::text').extract_first().split(" – ")
name = anExec[0]
if len(anExec) > 1:
position = anExec[1]
# Again, this should never be the case, as an Exec-less
# company would find it hard to get much done.
else:
position = ""
execs.append((name,position,details['company']))
# This does the same, but with the analysts (which never seem
# to be separated by en dashes, for some reason).
elif currStage == Stage['analysts']:
name = chunks[i].css('p::text').extract_first().split(" - ")[0]
company = chunks[i].css('p::text').extract_first().split(" - ")[1]
analysts.append((name,company))
# This strips the transcript body of everything except simple
# HTML, and stores that.
elif currStage == Stage['body']:
line = chunks[i].css('p::text').extract_first()
html = "p>"
if line is None:
line = chunks[i].css('strong::text').extract_first()
html = "h1>"
script.append("<"+html+line+"</"+html)
else:
mode += 1
i += 1
# Adds the various arrays to the dictionary for the transcript
details['exec'] = execs
details['analysts'] = analysts
details['transcript'] = ''.join(script)
# Adds this transcript to the dictionary of all scraped
# transcripts, and yield that for the output
transcript["entry"] = details
yield transcript
I have been stuck on this for a week now (still new to Python and web scraping) so it would be great if someone brighter than me could take a look!
It seems that the transcripts are spread across multiple pages.
So I think you have to add to your parse_transcript method a part where you find the link to the next page of the transcript, then open it and submit it to parse_transcript.
Something like this:
# Follows the pagination links at the bottom of transcript page.
next_page = response.css(YOUR CSS SELECTOR GOES HERE).extract_first()
if next_page is not None:
next_page = response.urljoin(next_page)
yield scrapy.Request(next_page, callback=self.parse_transcript)
Obviously, you have to modify your parse_transcript method so that it parses more than just the paragraphs extracted from the first page. You have to make this part more general:
body = response.css('div#a-body p.p1')
chunks = body.css('p.p1')
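For example, here is a minimal sketch (inside the same spider class, using the scrapy import already at the top of the file) of how parse_transcript could follow the transcript's own pagination while carrying the paragraphs collected so far in the request's meta dict. The a.next_page selector is a placeholder assumption; inspect the transcript pages and substitute the real next-page link selector:

def parse_transcript(self, response):
    # Paragraphs (as raw HTML strings) collected from earlier pages of
    # this transcript; empty on the first page.
    chunks = response.meta.get('chunks', []) + response.css('div#a-body p').extract()
    # PLACEHOLDER selector for the transcript's "next page" link --
    # replace it with the real pagination selector.
    next_page = response.css('a.next_page::attr(href)').extract_first()
    if next_page is not None:
        # More pages remain: pass the accumulated paragraphs along.
        yield scrapy.Request(response.urljoin(next_page),
                             callback=self.parse_transcript,
                             meta={'chunks': chunks})
    else:
        # Last page reached: rebuild one selector over the full body so
        # the existing preamble/execs/analysts/body loop can run on it.
        full_body = scrapy.Selector(text=''.join(chunks))
        # ... run the existing chunk-by-chunk logic over full_body here ...
        yield {'transcript': full_body.css('p::text').extract()}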
I can't make an HTTP request from Erlang/Cowboy at all. I can make one from the Erlang shell, but not when running a Cowboy release. I've tried using the hackney library as well:
hackney:start(),
{ok, _, _, Ref} = hackney:request(
get, <<"http://www.youtube.com">>, [], <<>>, [{pool, default}]
),
{ok, Body} = hackney:body(Ref),
io:format("body: ~p~n~n", [Body]),
Error:
Error in process <0.361.0> on node 'cta_erlang_backend@127.0.0.1' with exit value:
{[{reason,undef},
{mfa,{hello_handler,handle,2}},
{stacktrace,[{hackney,start,[],[]},
{hello_handler,handle,2,
[{file,"src/hello_handler.erl"},{line,18}]},
{cowboy_handler,handler_handle,4,
[{file,"src/cowboy_handler.erl"},{line,111}]},
{cowboy_protocol,execute,4,
[{file,"src/cowboy_protocol.erl"},
{line,442}]}]},
{req,[{socket,#Port<0.267>},
{transport,ranch_tcp},
{connection,keepalive},
{pid,<0.361.0>},
{method,<<"POST">>},
{version,'HTTP/1.1'},
{peer,{{10,0,0,1},40049}},
{host,<<"10.0.0.103">>},
{host_info,undefined},
{port,8080},
{path,<<"/">>},
{path_info,undefined},
{qs,<<>>},
{qs_vals,undefined},
{bindings,[]},
{headers,[{<<"host">>,<<"10.0.0.103:8080">>},
{<<"connection">>,<<"keep-alive">>},
{<<"content-length">>,<<"4">>},
{<<"cache-control">>,<<"no-cache">>},
{<<"origin">>,
<<"chrome-extension://fdmmgilgnpjigdojojpjoooidkmcomcm">>},
{<<"user-agent">>,
<<"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/39.0.2171.65 Chrome/39.0.2171.65 Safari/537.36">>},
{<<"content-type">>,<<"text/plain;charset=UTF-8">>},
{<<"accept">>,<<"*/*">>},
{<<"accept-encoding">>,<<"gzip, deflate">>},
{<<"accept-language">>,<<"en-GB,en-US;q=0.8,en;q=0.6">>}]},
{p_headers,[{<<"connection">>,[<<"keep-alive">>]}]},
{cookies,undefined},
{meta,[]},
{body_state,waiting},
{buffer,<<"asdf">>},
{multipart,undefined},
{resp_compress,false},
{resp_state,waiting},
{resp_headers,[]},
{resp_body,<<>>},
{onresponse,undefined}]},
{state,{state}}],
[{cowboy_protocol,execute,4,[{file,"src/cowboy_protocol.erl"},{line,442}]}]}
=ERROR REPORT==== 19-Oct-2016::18:56:51 ===
Ranch listener my_http_listener had connection process started with cowboy_protocol:start_link/4 at <0.361.0> exit with reason: (the same {reason,undef} term as shown above)
hello_handler.erl:
-module(hello_handler).
-behaviour(cowboy_http_handler).
-export([init/3]).
-export([handle/2]).
-export([terminate/3]).
-record(state, {
}).
init(_, Req, _Opts) ->
hackney:start(),
{ok, Req, #state{}}.
handle(Req, State) ->
{Method, Req2} = cowboy_req:method(Req),
case Method of
<<"POST">> ->
{ok, _, _, Ref} = hackney:request(get, <<"http://www.youtube.com">>,
[], <<>>, [{pool, default}]),
{ok, Body} = hackney:body(Ref),
io:format("body: ~p~n~n", [Body]),
ResponseBody = <<"Hello Erl POST!">>;
<<"GET">> ->
ResponseBody = <<"Hello Erlang1!">>
end,
%% Reply using the Req2 returned by cowboy_req:method/1, and bind the
%% result to a fresh variable; Req2 is already bound, so rebinding it
%% would cause a badmatch.
{ok, Req3} = cowboy_req:reply(200,
[{<<"content-type">>, <<"text/plain">>}],
ResponseBody,
Req2),
{ok, Req3, State}.
terminate(_Reason, _Req, _State) ->
ok.
{[{reason,undef},
{mfa,{hello_handler,handle,2}},
{stacktrace,[{hackney,start,[],[]},
{hello_handler,handle,2,
[{file,"src/hello_handler.erl"},{line,18}]},
{cowboy_handler,handler_handle,4,
[{file,"src/cowboy_handler.erl"},{line,111}]},
{cowboy_protocol,execute,4,
[{file,"src/cowboy_protocol.erl"},
{line,442}]}]},
Crash at cowboy_handler.erl line 111, https://github.com/ninenines/cowboy/blob/1.1.x/src/cowboy_handler.erl#L111
Reason: hello_handler:handle/2 is undef
So:
Make sure your hello_handler.erl is in the src dir;
Compile it with rebar compile;
Restart the server, or run l(hello_handler) in the Erlang shell.
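Assuming the usual rebar layout, the steps look roughly like this (the paths are common defaults, not taken from your project):

$ ls src/hello_handler.erl        # the handler source must live under src/
$ rebar compile                   # produces ebin/hello_handler.beam
$ erl -pa ebin -pa deps/*/ebin    # or simply restart your release
1> l(hello_handler).              % reload the freshly compiled module
{module,hello_handler}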
I need to write an application in Delphi that accesses this site and uses the form there to download a .dat file.
http://www.bmfbovespa.com.br/BancoTitulosBTC/PosicoesEmAberto.aspx?Idioma=pt-br
In a browser I just click the second "buscar" button and get the file automatically.
This is what I have so far:
program Project1;
{$APPTYPE CONSOLE}
uses
SysUtils, idHTTP, Classes, MSHTML, SHDocVw, Httpapp, System.Variants, idURI;
var
http: TIdHttp;
url: string;
code: integer;
parameters: TStringList;
Response: TStringStream;
Resultado: string;
MS: TMemoryStream;
begin
url := 'http://www.bmfbovespa.com.br/BancoTitulosBTC/PosicoesEmAberto.aspx?Idioma=pt-br';
http := TIdHTTP.Create(nil);
parameters := TStringList.Create;
Response := TStringStream.Create;
MS := TMemoryStream.Create;
try
try
Parameters.Add('__EVENTTARGET=');
Parameters.Add('__EVENTARGUMENT=');
Parameters.Add('__VIEWSTATE=/wEPDwUKMTI5NDMyNjQ4NA8WAh4QdGlwb1Bvc2ljYW9BdGl2bwspc1RpcG9Qb3NpY2Fv'+
'QXRpdm8sIEJvdmVzcGEuU2l0ZUJtZkJvdmVzcGEuQmFuY29UaXR1bG9zQlRDLldlYiwgVmVyc2lvbj0x'+
'LjAuMC4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGwAFgJmDw8WCB4Ebm9kZQUk'+
'YjViNDhlNjYtZTMwMC00NTMzLTgwYzktMzI2NmM5ZDY2ODBiHgpQYXJlbnROb2RlBUsvcHQtYnIvc2Vy'+
'dmljb3MvZW1wcmVzdGltby1kZS1hdGl2b3MvZW1wcmVzdGltby1kZS1hdGl2b3MuYXNweD9JZGlvbWE9'+
'cHQtYnIeC0N1cnJlbnROb2RlBTMvYmFuY290aXR1bG9zYnRjL3Bvc2ljb2VzZW1hYmVydG8uYXNweD9'+
'JZGlvbWE9cHQtYnIeCkN1cnJlbnRVcmwFMy9CYW5jb1RpdHVsb3NCVEMvUG9zaWNvZXNFbUFiZXJ0by5'+
'hc3B4P0lkaW9tYT1wdC1icmQWAgIDD2QWAgIBD2QWCgIBD2QWCAIDDxYCHgdWaXNpYmxlaGQCDA8WBh4'+
'Hb25jbGljawUQdGhpcy52YWx1ZSA9ICcnOx4Gb25ibHVyBTFpZih0aGlzLnZhbHVlID09ICcnKSB7dGh'+
'pcy52YWx1ZT0gbm9tZVRleHRCdXNjYTt9HgpvbmtleXByZXNzBSRyZXR1cm4ga2V5UHJlc3NQZXNxdWl'+
'zYSh0aGlzLGV2ZW50KTtkAg0PDxYCHg1PbkNsaWVudENsaWNrBRxyZXR1cm4gVmVyaWZpY2FyQ2FtcG9'+
'CdXNjYSgpZGQCDg8WAh4EVGV4dAWXMzxkaXYgaWQ9J21lbnUnPjx1bCBpZD0nbWVudUhvcml6Jz48bGk'+
'gaWQ9J2FibWZib3Zlc3BhJz48YSBocmVmPScjJyBjbGFzcz0nYWJtZmJvdmVzcGEnIGlkPSdsaW5rQWJ'+
'tZic+PGltZyBzcmM9Jy9zaGFyZWQvY3NzL2ltZy90cmFuc3AuZ2lmJyAvPjwvYT48dWwgb25tb3VzZW9'+
'2ZXI9ImxpbmtBYm1mLmNsYXNzTmFtZT0nYWJtZmJvdmVzcGFob3Zlcic7IiBvbm1vdXNlb3V0PSJsaW'+
'5rQWJtZi5jbGFzc05hbWU9J2FibWZib3Zlc3BhJzsiPjxsaT48YSBocmVmPScvcHQtYnIvaW50cm9zL2'+
'ludHJvLXNvYnJlLWEtYm9sc2EuYXNweCcgdGFyZ2V0PScnPk8gcXVlIGEgQm9sc2EgZmF6PC9hPjwvbG'+
'k+PGxpPjxhIGhyZWY9Jy9wdC1ici9hLWJtZmJvdmVzcGEvdmlzaXRhcy1hLWJvbHNhL3Zpc2l0YXMtYS'+
'1ib2xzYS5hc3B4JyB0YXJnZXQ9Jyc+VmlzaXRlIGEgQm9sc2E8L2E+PC9saT48bGk+PGEgaHJlZj0nL3'+
'B0LWJyL2EtYm1mYm92ZXNwYS91bmlkYWRlcy91bmlkYWRlcy5hc3B4JyB0YXJnZXQ9Jyc+Tm9zc2FzIH'+
'VuaWRhZGVzPC9hPjwvbGk+PGxpPjxhIGhyZWY9Jy9wdC1ici9hLWJtZmJvdmVzcGEvc3VzdGVudGFiaW'+
'xpZGFkZS5hc3B4JyB0YXJnZXQ9Jyc+U3VzdGVudGFiaWxpZGFkZTwvYT48L2xpPjxsaT48YSBocmVmPS'+
'dodHRwOi8vd3d3Lmluc3RpdHV0b2JtZmJvdmVzcGEub3JnLmJyL3B0LWJyL2hvbWUuYXNwJyB0YXJnZX'+
'Q9J19ibGFuayc+SW5zdGl0dXRvIEJNJkZCT1ZFU1BBPC9hPjwvbGk+PGxpPjxhIGhyZWY9Jy9wdC1ici'+
'9hLWJtZmJvdmVzcGEvdHJhYmFsaGUtY29ub3Njby90cmFiYWxoZS1jb25vc2NvLmFzcHgnIHRhcmdldD'+
'0nJz5UcmFiYWxoZSBuYSBCb2xzYTwvYT48L2xpPjxsaT48YSBocmVmPScvc2FsYS1kZS1pbXByZW5zYS'+
'9zYWxhaW1wcmVuc2EuYXNweD9pZGlvbWE9cHQtYnInIHRhcmdldD0nJz5TYWxhIGRlIEltcHJlbnNhPC'+
'9hPjwvbGk+PC91bD48L2xpPjxsaSBpZD0nbWVyY2Fkbyc+PGEgaHJlZj0nIycgY2xhc3M9J21lcmNhZ'+
'G9zJyBpZD0nbGlua01lcmNhZG8nPjxpbWcgc3JjPScvc2hhcmVkL2Nzcy9pbWcvdHJhbnNwLmdpZicgL'+
'z48L2E+PHVsIG9ubW91c2VvdmVyPSJsaW5rTWVyY2Fkby5jbGFzc05hbWU9J21lcmNhZG9zaG92ZXInO'+
'yIgb25tb3VzZW91dD0ibGlua01lcmNhZG8uY2xhc3NOYW1lPSdtZXJjYWRvcyc7Ij48bGk+PGEgaHJlZ'+
'j0nL3B0LWJyL21lcmNhZG9zL2Fjb2VzLmFzcHgnIHRhcmdldD0nJz5Bw6fDtWVzIDwvYT48L2xpPjxsa'+
'T48YSBocmVmPScvcHQtYnIvbWVyY2Fkb3MvbWVyY2Fkb3JpYXMtZS1mdXR1cm9zLmFzcHgnIHRhcmdld'+
'D0nJz5NZXJjYWRvcmlhcyBlIEZ1dHVyb3M8L2E+PC9saT48bGk+PGEgaHJlZj0nL3B0LWJyL2ludHJvcy'+
'9pbnRyby1jYW1iaW8uYXNweCcgdGFyZ2V0PScnPkPDom1iaW88L2E+PC9saT48bGk+PGEgaHJlZj0nL3B'+
'0LWJyL2ludHJvcy9pbnRyby1hdGl2b3MuYXNweCcgdGFyZ2V0PScnPkF0aXZvczwvYT48L2xpPjxsaT'+
'48YSBocmVmPScvcHQtYnIvaW50cm9zL2ludHJvLWZ1bmRvcy5hc3B4JyB0YXJnZXQ9Jyc+RnVuZG9zIC'+
'8gRVRGczwvYT48L2xpPjxsaT48YSBocmVmPScvUmVuZGEtRml4YS9SZW5kYUZpeGEuYXNweCcgdGFyZ2'+
'V0PScnPlJlbmRhIEZpeGE8L2E+PC9saT48bGk+PGEgaHJlZj0nL3B0LWJyL2ludHJvcy9pbnRyby1vd'+
'XRyb3MtdGl0dWxvcy5hc3B4JyB0YXJnZXQ9Jyc+T3V0cm9zIFTDrXR1bG9zPC9hPjwvbGk+PC91bD48L'+
'2xpPjxsaSBpZD0nY2VudHJvZGVpbmZvcm1hY29lcyc+PGEgaHJlZj0nIycgY2xhc3M9J2NlbnRyb2Rla'+
'W5mb3JtYWNvZXMnIGlkPSdsaW5rQ2VudHJvJz48aW1nIHNyYz0nL3NoYXJlZC9jc3MvaW1nL3RyYW5zc'+
'C5naWYnIC8+PC9hPjx1bCBvbm1vdXNlb3Zlcj0ibGlua0NlbnRyby5jbGFzc05hbWU9J2NlbnRyb2Rla'+
'W5mb3JtYWNvZXNob3Zlcic7IiBvbm1vdXNlb3V0PSJsaW5rQ2VudHJvLmNsYXNzTmFtZT0nY2VudHJvZ'+
'GVpbmZvcm1hY29lcyc7Ij48bGk+PGEgaHJlZj0nL3B0LWJyL2VkdWNhY2lvbmFsL2N1cnNvcy9jdXJzb'+
'3MuYXNweCcgdGFyZ2V0PScnPkN1cnNvczwvYT48L2xpPjxsaT48YSBocmVmPScvcHQtYnIvZWR1Y2Fja'+
'W9uYWwvc2ltdWxhZG9yZXMvc2ltdWxhZG9yZXMuYXNweCcgdGFyZ2V0PScnPlNpbXVsYWRvcmVzPC9hP'+
'jwvbGk+PGxpPjxhIGhyZWY9Jy9wdC1ici9lZHVjYWNpb25hbC9vcmNhbWVudG8tcGVzc29hbC5hc3B4J'+
'yB0YXJnZXQ9Jyc+T3LDp2FtZW50byBwZXNzb2FsPC9hPjwvbGk+PGxpPjxhIGhyZWY9Jy9zaGFyZWQva'+
'WZyYW1lLmFzcHg/aWRpb21hPXB0LWJyJnVybD1odHRwOi8vd3d3LmJtZmJvdmVzcGEuY29tLmJyL3B0L'+
'WJyL2VkdWNhY2lvbmFsL2VkdWNhci9Gb3JtSW5zY3JpY2FvUGFsZXN0cmFBY2Vzc29JbnN0LmFzcCc'+
'gdGFyZ2V0PScnPlBhbGVz');
Parameters.Add('__EVENTVALIDATION=/wEWMAKU1bzjDAKatY+lDgLz2ISXCALR05XvBgKW'+
'jICHCwKn1oTUCwLzhvO8BQLzht/hDALzhsuGBALzhteEAQLzhsOpCALzhq/ODwLzhpvzBgLzhqfxAwL'+
'zhpOWCwLzhv+6AgLzhuvfCQLzhvfdBgLzhuOCDgLzhs+nBQLzhrvMDALzhsfKCQLzhrNvAvOGn5QIAv'+
'OGi7kPAvOGl7cMAvOGg9wDAvOG74ALAvOG26UCAvOG56MPAvOG08gGAvOGv+0NAo3a1W8CjNrVbwKP2'+
'tVvAo7a1W8CidrVbwKI2tVvAova1W8CitrVbwKV2tVvApTa1W8CpqKfswMCrqXL7AcChv7z9w4C4/vLv'+
'gUCo/HJ+QsCtLPk8g6HXv3ITGyMQJG6GJIiOc0sGh7cpg==');
Parameters.Add('ctl00$ucTopo$btnBusca=Busca');
Parameters.Add('ctl00$menuBOVESPASecundario=');
Parameters.Add('ctl00$contentPlaceHolderConteudo$tabPosicaoEmAberto={"State":{},"TabState":{"ctl00_contentPlaceHolderConteudo_tabPosicaoEmAberto_tabAcoes":{"Selected":true}}}');
Parameters.Add('ctl00$contentPlaceHolderConteudo$acoes$txtConsultaData$txtConsultaData=2014-07-22');
Parameters.Add('ctl00$contentPlaceHolderConteudo$acoes$txtConsultaData$txtConsultaData$dateInput=2014-07-22-00-00-00');
Parameters.Add('ctl00_contentPlaceHolderConteudo_acoes_txtConsultaData_txtConsultaData_calendar_SD=[]');
Parameters.Add('ctl00_contentPlaceHolderConteudo_acoes_txtConsultaData_txtConsultaData_calendar_AD=[[2014,5,22],[2014,7,22],[2014,7,22]]');
Parameters.Add('ctl00$contentPlaceHolderConteudo$acoes$txtConsultaEmpresa=');
Parameters.Add('ctl00$contentPlaceHolderConteudo$acoes$txtConsultaDataDownload$txtConsultaDataDownload=2014-07-02');
Parameters.Add('ctl00$contentPlaceHolderConteudo$acoes$txtConsultaDataDownload$txtConsultaDataDownload$dateInput=2014-07-02-00-00-00');
Parameters.Add('ctl00_contentPlaceHolderConteudo_acoes_txtConsultaDataDownload_txtConsultaDataDownload_calendar_SD=[]');
Parameters.Add('ctl00_contentPlaceHolderConteudo_acoes_txtConsultaDataDownload_txtConsultaDataDownload_calendar_AD=[[2014,5,22],[2014,7,22],[2014,7,22]]');
Parameters.Add('ctl00$contentPlaceHolderConteudo$acoes$btnBuscarArquivos=Buscar');
Parameters.Add('ctl00$contentPlaceHolderConteudo$mpgPaginas_Selected=0');
Parameters.Add('cboAgentesCorretorasNome=#');
Parameters.Add('cboAgentesCorretorasCodigo=#');
http.HandleRedirects := true;
http.Request.UserAgent := 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36';
http.AllowCookies := True;
http.Request.ContentType := 'application/x-www-form-urlencoded';
http.Post(url,Parameters,Response);
MS.SaveToFile('C:\teste.dat');
except
WriteLn(IntToStr(code));
finally
http.Free();
FreeAndNil(parameters);
FreeAndNil(response);
ReadLn;
end;
end.
I don't know how to get the file, or even whether I'm sending the right parameters.
Any suggestions?
Thanks in advance!
EDIT:
I found out that the __VIEWSTATE and __EVENTVALIDATION parameters change from time to time, so now I have this:
program Project1;
{$APPTYPE CONSOLE}
uses
SysUtils, idHTTP, Classes, MSHTML, SHDocVw, Httpapp, System.Variants, idURI;
function ExtractHiddenParameter(const ParameterName: string; const Request: string): string;
const
PrefixMask = 'input type="hidden" name="%s" id="%s" value="';
Suffix = '" />';
var
Prefix: string;
PrefixLength: Integer;
PrefixPosition: Integer;
SuffixPosition: Integer;
begin
Prefix := Format(PrefixMask, [ParameterName, ParameterName]);
PrefixPosition := Pos(Prefix, Request);
if PrefixPosition = 0 then
Result := ''
else
begin
PrefixLength := Length(Prefix);
Result := Copy(Request,
PrefixPosition + PrefixLength,
1 + Length(Request) - PrefixPosition - PrefixLength);
SuffixPosition := Pos(Suffix, Result);
if SuffixPosition = 0 then
Result := ''
else
Delete(Result, SuffixPosition, 1 + Length(Result) - SuffixPosition);
end;
end;
var
http: TIdHttp;
url: string;
getRequest: string;
code: integer;
parameters: TStringList;
Response: TStringStream;
Resultado: string;
sViewState: string;
sEventValidation: string;
MS: TMemoryStream;
begin
url := 'http://www.bmfbovespa.com.br/BancoTitulosBTC/PosicoesEmAberto.aspx?Idioma=pt-br';
http := TIdHTTP.Create(nil);
parameters := TStringList.Create;
Response := TStringStream.Create;
MS := TMemoryStream.Create;
try
try
http.ProxyParams.ProxyServer := 'proxy-scl';
http.ProxyParams.ProxyPort := 3128;
http.HandleRedirects := true;
http.Request.UserAgent := 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36';
http.AllowCookies := True;
getRequest := http.Get(url);
sViewState := ExtractHiddenParameter('__VIEWSTATE', getRequest);
sEventValidation := ExtractHiddenParameter('__EVENTVALIDATION', getRequest);
Parameters.Add('__EVENTTARGET=');
Parameters.Add('__EVENTARGUMENT=');
Parameters.Add('__VIEWSTATE='+sViewState);
Parameters.Add('__EVENTVALIDATION='+sEventValidation);
Parameters.Add('ctl00$ucTopo$btnBusca=Busca');
Parameters.Add('ctl00$menuBOVESPASecundario=');
Parameters.Add('ctl00$contentPlaceHolderConteudo$tabPosicaoEmAberto={"State":{},"TabState":{"ctl00_contentPlaceHolderConteudo_tabPosicaoEmAberto_tabAcoes":{"Selected":true}}}');
Parameters.Add('ctl00$contentPlaceHolderConteudo$acoes$txtConsultaData$txtConsultaData=2014-07-22');
Parameters.Add('ctl00$contentPlaceHolderConteudo$acoes$txtConsultaData$txtConsultaData$dateInput=2014-07-22-00-00-00');
Parameters.Add('ctl00_contentPlaceHolderConteudo_acoes_txtConsultaData_txtConsultaData_calendar_SD=[]');
Parameters.Add('ctl00_contentPlaceHolderConteudo_acoes_txtConsultaData_txtConsultaData_calendar_AD=[[2014,5,22],[2014,7,22],[2014,7,22]]');
Parameters.Add('ctl00$contentPlaceHolderConteudo$acoes$txtConsultaDataDownload$txtConsultaDataDownload=2014-07-02');
Parameters.Add('ctl00$contentPlaceHolderConteudo$acoes$txtConsultaDataDownload$txtConsultaDataDownload$dateInput=2014-07-02-00-00-00');
Parameters.Add('ctl00_contentPlaceHolderConteudo_acoes_txtConsultaDataDownload_txtConsultaDataDownload_calendar_SD=[]');
Parameters.Add('ctl00_contentPlaceHolderConteudo_acoes_txtConsultaDataDownload_txtConsultaDataDownload_calendar_AD=[[2014,5,22],[2014,7,22],[2014,7,22]]');
Parameters.Add('ctl00$contentPlaceHolderConteudo$acoes$btnBuscarArquivos=Buscar');
Parameters.Add('ctl00$contentPlaceHolderConteudo$mpgPaginas_Selected=0');
Parameters.Add('cboAgentesCorretorasNome=#');
Parameters.Add('cboAgentesCorretorasCodigo=#');
http.Head(url);
code := http.ResponseCode;
http.Request.ContentType := 'application/x-www-form-urlencoded';
http.Post(url,Parameters,Response);
MS.SaveToFile('C:\teste.dat');
except
on E: EIdHTTPProtocolException do
code := http.ResponseCode;
end;
WriteLn(IntToStr(code));
finally
http.Free();
FreeAndNil(parameters);
FreeAndNil(response);
ReadLn;
end;
end.
You were very close. Just change the line
MS.SaveToFile('C:\teste.dat');
into
Response.SaveToFile('C:\teste.dat');
You can also remove the TMemoryStream from your code; it is not needed.
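The reason is that TIdHTTP.Post writes the response body into the stream you pass as the third argument, so Response already holds the downloaded data, while MS is never written to. A minimal sketch of the corrected tail of the program:

http.Request.ContentType := 'application/x-www-form-urlencoded';
http.Post(url, Parameters, Response);  // Post fills Response with the reply body
Response.SaveToFile('C:\teste.dat');   // save exactly what the server returned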