getHeadlineという関数に、対象となる会社のticker、その会社の属するmarket(NASDAQなど)、検索対象となる開始日と終了日をstartとendというパラメータで指定すると、keyが日付、valueがその日のヘッドラインのリストである辞書を返す。
from __future__ import unicode_literals
import urllib2
import urllib
import re
from datetime import datetime,timedelta
import nltk
# Two-digit month number for each English month abbreviation used in
# absolute publication dates ("Sep 5, 2013").
_MONTH_NUMBERS = {
    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
    'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
    'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
}

# "<source> - <N> <unit> ago", e.g. "Reuters - 5 hours ago".
_RELATIVE_DATE_RE = re.compile(
    r'[\s\w0-9/\.,:;\(\)]+\-\s([0-9]+)\s(\w+)\sago\s*')

# "<source> - <Mon> <D>, <YYYY>", e.g. "Reuters - Sep 5, 2013".
# Any four-digit year is accepted (the previous pattern only matched
# 2010-2013 and silently dropped everything else).
_ABSOLUTE_DATE_RE = re.compile(
    r'[\s\w0-9/\.,:;\(\)]+\-\s(\w+)\s([0-9]{1,2}),\s([0-9]{4})\s*')


def _relative_age(count, unit):
    """Convert "<count> <unit> ago" into a timedelta."""
    unit = unit.lower()
    if unit.startswith('sec'):
        return timedelta(seconds=count)
    if unit.startswith('min'):
        return timedelta(minutes=count)
    if unit.startswith('day'):
        return timedelta(days=count)
    if unit.startswith('week'):
        return timedelta(weeks=count)
    # Hours, and any unit we do not recognise, keep the historical
    # "count is a number of hours" interpretation.
    return timedelta(hours=count)


def _date_key(line, now):
    """Return the 'YYYY_MM_DD' key encoded in a date line, or None."""
    relative = _RELATIVE_DATE_RE.match(line)
    if relative:
        published = now - _relative_age(int(relative.group(1)),
                                        relative.group(2))
        return '%d_%02d_%02d' % (published.year, published.month,
                                 published.day)

    absolute = _ABSOLUTE_DATE_RE.match(line)
    if absolute:
        month = _MONTH_NUMBERS.get(absolute.group(1))
        if month is None:
            # A line that merely looks like a date ("foo 12, 2013").
            return None
        # Zero-pad the day so keys have the same shape as the ones
        # produced for relative dates.
        return absolute.group(3) + '_' + month + '_' + absolute.group(2).zfill(2)

    return None


def _group_headlines(lines, now):
    """Group each headline under the date found on the line that follows it."""
    news = {}
    # Pair every line with its successor: the successor is the candidate
    # "source - date" line, the line itself is the candidate headline.
    # Starting from the first pair also means the very first line is never
    # treated as a date line (it has no headline before it).
    for headline, date_line in zip(lines, lines[1:]):
        key = _date_key(date_line, now)
        if key is not None:
            news.setdefault(key, []).append(headline)
    return news


def getHeadline(ticker, market, start, end):
    """Fetch news headlines for a company and group them by publication date.

    Requests the Google Finance company-news page for ``ticker:market``
    restricted to the given date range, passes the HTML through
    ``nltk.clean_html``, and scans the resulting text lines for
    "source - date" lines. The line immediately before each such line is
    taken to be the headline.

    Args:
        ticker: Ticker symbol, e.g. 'AAPL'.
        market: Market name, e.g. 'NASDAQ'.
        start: Start date as 'YYYY-MM-DD'.
        end: End date as 'YYYY-MM-DD'.

    Returns:
        A dict mapping 'YYYY_MM_DD' date strings to lists of headline
        strings. Key order is unspecified. Relative dates ("5 hours ago")
        are resolved against the current UTC time.
    """
    query = urllib2.quote(ticker + ':' + market)
    base = 'https://www.google.com/finance/company_news?q='
    url = (base + query + '&startdate=' + start + '&enddate=' + end +
           '&start=0&num=10000')
    # Diagnostic output kept from the original implementation.
    print(url)

    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()

    # NOTE(review): nltk.clean_html only exists in NLTK 2.x; NLTK 3 raises
    # NotImplementedError here - confirm the installed version.
    raw = nltk.clean_html(html)
    raw = unicode(raw, errors='ignore')

    # NOTE(review): the first 60 lines and the last line are discarded;
    # presumably page chrome around the news list - confirm against a
    # real response.
    data = raw.split('\n')[60:-1]
    # Drop the single-space filler lines in one pass.
    data = [line for line in data if line != u' ']

    # Use UTC directly instead of "local time + 7h", which was only
    # correct on a machine running in UTC-7.
    return _group_headlines(data, datetime.utcnow())
# Example usage of getHeadline.
# The function takes a ticker, a market name, a start date (YYYY-MM-DD) and
# an end date (YYYY-MM-DD), and returns a dictionary shaped like
#   {date: [headline1, headline2, ...], date: [headline3], ...}
# The date keys are not ordered. Dates are GMT (checked by the original
# author against the publish time in the RSS XML).
# NOTE: this call runs whenever the module is executed and performs a
# network request.
print(getHeadline('AAPL', 'NASDAQ', '2012-09-01', '2013-09-15'))