#! ruby -Ke
# tv_yahoo.rb (coding: euc-jp) 2010/02/19
# 東京の AM, FM, TV, BS1, BS2 の番組表を各々12時間分取得
# 地区と時間数は変更可能。Yahoo_area, tv_yahooを参照。
# Copyright (C) T. Yoshiizumi, 2010 All rights reserved.
require 'rubygems'
require 'hpricot'
require 'nkf'
# Encoding flag handed to NKF so that fetched pages are converted to the encoding
# this script itself is written in: 'e' = EUC-JP, 's' = Shift_JIS, 'w' = UTF-8.
# Ruby 1.8 exposes the script encoding through $KCODE (set by the -K switch).
# On Ruby 1.9 and later $KCODE is nil, so the source encoding is consulted
# instead; without this fallback the script stops here with NoMethodError.
kcode_flag = $KCODE.to_s[0, 1].downcase
if kcode_flag.empty? && defined?(__ENCODING__)
  kcode_flag =
    case __ENCODING__.name
    when /euc/i then 'e'
    when /shift_jis|windows-31j|sjis|cp932/i then 's'
    else 'u'
    end
end
# NKF spells its UTF-8 output option "-w", not "-u".
$NKF_C = kcode_flag == 'u' ? 'w' : kcode_flag
$OUT_C = 's' # output files are written as Shift_JIS
# Small HTTP fetcher.
# With os_val == :WIN32 it drives an XMLHTTP COM object through WIN32OLE;
# with any other value it relies on the open-uri standard library.
class Mshttp
  # COM ProgIDs tried in this order until one can be instantiated.
  XMLHTTP_PROG_IDS = ['Msxml2.XMLHTTP', 'MSXML.XMLHTTPRequest', 'Microsoft.XMLHTTP']

  # os_val - :WIN32 to use the COM object, anything else (default :COMMON)
  #          to use open-uri.
  def initialize(os_val = :COMMON)
    @os_val = os_val
    @MSHTTP = nil
    @sec_limit = 20 # timeout in seconds; now initialised for every os_val
    @body = ''
    if @os_val == :WIN32
      require 'win32ole'
      @MSHTTP = create_xmlhttp
    else
      require 'open-uri'
    end
  end

  attr_accessor :sec_limit
  attr_reader :body

  # Fetches the document at +url+ and remembers it in #body.
  #
  # Returns the document as a String. Returns false when the request could not
  # be made or timed out, and '' when the WIN32 backend is given a non-http
  # URL or receives a status other than 200. #body is reset to '' at the start
  # of every call, so a failed call never exposes the previous document.
  def web_open(url)
    @body = ''
    if @os_val == :WIN32
      return false unless @MSHTTP
      return @body unless url =~ /^http/i
      fetched = fetch_with_xmlhttp(url)
    else
      fetched = fetch_with_open_uri(url)
    end
    return false if fetched == false
    @body = fetched
    return @body
  end

  private

  # Returns the first XMLHTTP COM object that can be created, or nil.
  # (The original tested `defined? @MSHTTP`, which is always true once the
  # variable has been assigned, so the fallback ProgIDs were never tried.)
  def create_xmlhttp
    XMLHTTP_PROG_IDS.each do |prog_id|
      begin
        return WIN32OLE.new(prog_id)
      rescue
        # this ProgID is not registered; try the next one
      end
    end
    nil
  end

  # Performs a GET through the COM object.
  # Returns the body String ('' for a non-200 status) or false on failure.
  def fetch_with_xmlhttp(url)
    @MSHTTP.Open "GET", url, false
    return abort_request unless wait_for_ready_state(1)
    begin
      @MSHTTP.Send
    rescue
      return abort_request
    end
    return abort_request unless wait_for_ready_state(4)
    fetched = @MSHTTP.status == 200 ? @MSHTTP.responseBody.pack("c*") : ''
    @MSHTTP.abort
    fetched
  end

  # Polls readyState once per second until it equals +state+.
  # Returns false once more than @sec_limit seconds have been spent waiting.
  def wait_for_ready_state(state)
    waited = 0
    while @MSHTTP.readyState != state
      sleep 1
      waited += 1
      return false if waited > @sec_limit
    end
    true
  end

  # Aborts the pending COM request and reports failure.
  def abort_request
    @MSHTTP.abort
    false
  end

  # Reads +url+ with open-uri; anything that is not an http/https/ftp URL is
  # read as a local file. Returns the content, or false on any StandardError.
  # Kernel#open is deliberately avoided: it leaves the handle open, runs
  # "|command" strings as subprocesses, and stopped opening URLs in Ruby 3.0.
  def fetch_with_open_uri(url)
    if url =~ %r{\A(?:https?|ftp)://}i
      URI.parse(url).read
    else
      File.open(url) { |io| io.read }
    end
  rescue
    false
  end
end
# Helpers for working with HTML tables.
# table_to_array needs Hpricot; the caller is expected to have done
#   require 'rubygems'
#   require 'hpricot'
module Html_tbl
  # Converts table markup into a two-dimensional Ruby array, expanding
  # merged cells.
  #
  # tbl_str   - HTML text containing the table rows
  # data_type - :TEXT stores each cell's inner text; any other value (:HTML)
  #             stores the cell's own markup, i.e. <td>...</td>
  #
  # The top-left slot of a merged cell holds its data; every other slot the
  # cell covers through colspan/rowspan is filled with "". A rowspan that
  # reaches past the last row is clipped instead of raising.
  # Returns [] when the markup contains no <tr>.
  def table_to_array(tbl_str, data_type=:TEXT)
    tbl = Hpricot(tbl_str)
    rows = tbl.search("tr")
    aa = Array.new(rows.length) { [] }
    rows.each_with_index do |tr, row_n|
      tr.search("td").each do |td|
        data = data_type == :TEXT ? td.inner_text : td.to_html
        # first free column; slots reserved by earlier spans hold "" and are skipped
        col_n = aa[row_n].index(nil) || aa[row_n].size
        aa[row_n][col_n] = data
        cspan = (td.attributes['colspan'] =~ /(\d+)/) ? $1.to_i : 1
        rspan = (td.attributes['rowspan'] =~ /(\d+)/) ? $1.to_i : 1
        # reserve the remaining columns covered by colspan
        ((col_n + 1)...(col_n + cspan)).each do |c|
          aa[row_n][c] = "" if aa[row_n][c].nil?
        end
        # reserve the same columns in the following rows covered by rowspan
        ((row_n + 1)...(row_n + rspan)).each do |r|
          break if r >= aa.size
          (col_n...(col_n + cspan)).each do |c|
            aa[r][c] = "" if aa[r][c].nil?
          end
        end
      end
    end
    return aa
  end

  # Extra: returns the transpose of a two-dimensional array.
  # Short rows are padded with '' and nil cells become ''. Rows that are not
  # Arrays are treated as empty rows.
  # Returns false when +aa+ is not an Array or contains no cells at all.
  # A single-column table is transposed into one row rather than rejected.
  def table_turn(aa)
    return false unless aa.is_a?(Array)
    rows = aa.map { |row| row.is_a?(Array) ? row : [] }
    width = rows.map { |row| row.size }.max || 0
    return false if width < 1
    (0...width).map do |j|
      rows.map { |row| row[j].nil? ? '' : row[j] }
    end
  end
end
# Renders the listings structure returned by tv_yahoo as plain text.
#
# ary - array of pages, each shaped
#       [[title, url], [station, [time, contents], ...], ...]
#
# Every page contributes its title and URL on separate lines, followed by one
# paragraph per station: the station name, then "time<TAB>contents" lines,
# then a blank line. An entry that is a plain String (a line that carried no
# time) is written unchanged; indexing it like a pair would print character
# codes (Ruby 1.8) or single characters (1.9+) instead of the text.
# Returns the text as a String ('' for an empty +ary+).
def prog_to_text(ary)
  lines = []
  ary.each do |page_ary|
    title, url = page_ary[0]
    lines << "#{title}\n" << "#{url}\n"
    (page_ary[1..-1] || []).each do |station_ary|
      lines << "#{station_ary[0]}\n"
      (station_ary[1..-1] || []).each do |entry|
        if entry.is_a?(Array)
          lines << "#{entry[0]}\t#{entry[1]}\n"
        else
          lines << "#{entry}\n"
        end
      end
      lines << "\n"
    end
  end
  return lines.join
end
# Definitions used to fetch the Yahoo! programme listings
# Prefecture name => area code. tv_yahoo looks the code up by prefecture name
# and places it in the "area" query parameter of the listings URL.
# NOTE(review): the codes are kept exactly as found; several are abbreviated or
# irregular (e.g. "yamagchi", "nawa", "miya") — confirm against the site before
# treating any of them as a typo.
Yahoo_area = {
"北海道"=>"hokk",
"青森"=>"aomori",
"岩手"=>"iwate",
"秋田"=>"akita",
"山形"=>"yamagata",
"宮城"=>"miya",
"福島"=>"fuku",
"栃木"=>"tochigi",
"群馬"=>"gunma",
"茨城"=>"ibaraki",
"東京"=>"tokyo",
"神奈川"=>"kanagawa",
"埼玉"=>"saitama",
"千葉"=>"chiba",
"山梨"=>"yamanasi",
"新潟"=>"niigata",
"長野"=>"nagano",
"石川"=>"ishikawa",
"富山"=>"toyama",
"福井"=>"fukui",
"静岡"=>"sizu",
"愛知"=>"aichi",
"岐阜"=>"gifu",
"三重"=>"mie",
"和歌山"=>"wakayama",
"奈良"=>"nara",
"滋賀"=>"siga",
"京都"=>"kyoto",
"大阪"=>"osaka",
"兵庫"=>"hyougo",
"徳島"=>"tokusima",
"愛媛"=>"ehime",
"高知"=>"kouchi",
"香川"=>"kagawa",
"岡山"=>"okayama",
"広島"=>"hirosima",
"鳥取"=>"tottori",
"島根"=>"shimane",
"山口"=>"yamagchi",
"福岡"=>"fukuoka",
"佐賀"=>"saga",
"熊本"=>"kumamoto",
"長崎"=>"nagasaki",
"大分"=>"oita",
"宮崎"=>"miyazaki",
"鹿児島"=>"kagosima",
"沖縄"=>"nawa"}
# Yahoo! TV/radio listing categories; tv_yahoo upcases its category argument
# and returns an empty result unless it is one of these.
Yahoo_category = ['AM', 'FM', 'TV', 'BS1', 'BS2']
# Mix the table helpers into the top level so that tv_yahoo can call
# table_to_array and table_turn without a receiver.
include Html_tbl
# Builds the listings URL for the page that starts at the hour containing
# +sec+ (seconds since the epoch, interpreted in local time).
def tv_yahoo_url(area, category, span, sec)
  at = Time.at(sec)
  "http://tv.yahoo.co.jp/listings/" +
    "?area=#{area}&starttime=#{at.strftime('%H')}&detail=1&gcode=0&cal=0" +
    "&date=#{at.strftime('%Y%m%d')}&span=#{span}&type=normal&category=#{category}"
end

# Turns one column of the listings table (station name followed by its
# programme cells) into [station, [time, contents], ...].
# Lines that carry no time are kept as plain strings.
# Returns nil when the column holds nothing besides the station name.
def tv_yahoo_station(row, separate)
  cells = row.reject { |cell| cell.nil? || cell == "" }
  return nil if cells.empty?
  # the table repeats the station name at the bottom of the column
  cells.pop if cells[0] == cells.last
  pieces = []
  cells.each do |col|
    # skip "no programme information" placeholders and bare hour markers
    next if col =~ /番組情報がありません/ || col =~ /^\d+$/
    col = col.sub(/^(\d+\:\d+) +/, "\\1#{separate}")
    # a ◇ or ▽ followed by a time starts a new programme line
    col = col.gsub(/[◇▽](\d+)[::・](\d+)/, "\n\\1:\\2#{separate}")
    col = col.gsub(/[◇▽](\d\d)([^0-9])/, "\n\\1#{separate}\\2")
    pieces << col
  end
  return nil if pieces.empty?
  station_str = (pieces.join("\n") + "\n").gsub(/[◇▽]+/, " | ")
  station_ary = station_str.split(/\n/).map do |line|
    line =~ /\t/ ? line.split(/\t/) : line
  end
  station_ary.size > 1 ? station_ary : nil
end

# Fetches radio/TV listings from Yahoo! Japan.
#
# key      - prefecture name; must be a key of Yahoo_area
# category - one of Yahoo_category (matched after upcasing)
# span     - number of hours covered by one page
# page_max - number of consecutive pages to fetch, starting at the current hour
#
# Returns an array with one element per page, each shaped
#   [[title, url], [station, [time, contents], ...],
#    [station, [time, contents], ...], ...]
# Returns [] for an unknown key or category. Fetching stops at the first page
# that cannot be downloaded or that has no table whose class matches "channel".
# A page whose table cannot be transposed contributes nothing instead of
# raising.
def tv_yahoo(key='東京', category='AM', span=6, page_max=1)
  res_ary = []
  separate = "\t" # separator between time and programme contents
  category = category.upcase
  return res_ary unless Yahoo_category.include?(category)
  area = Yahoo_area[key]
  return res_ary if area.nil?
  current_sec = Time.now.to_i
  mshttp = Mshttp.new # Mshttp.new(:WIN32) makes this Windows-only
  (1..page_max).each do
    url = tv_yahoo_url(area, category, span, current_sec)
    body = mshttp.web_open(url)
    break if !body || body == ''
    # convert the page to this script's encoding and normalise line ends
    body = NKF.nkf("-m0 -#{$NKF_C}", body.gsub(/\r\n/, "\n"))
    # NOTE(review): the source showed gsub(/\ /, " "), a no-op that looks like
    # an entity decoded by an extraction step; restored as &nbsp; — confirm.
    body = body.gsub(/&nbsp;/, " ")
    doc = Hpricot(body)
    title_elem = doc.search("title").first
    title = title_elem ? title_elem.inner_text : nil
    channel_tbl = doc.search("table").find { |t| t.attributes['class'] =~ /channel/i }
    break unless channel_tbl
    tbl_str = channel_tbl.inner_html
    break if tbl_str.nil?
    aa = table_to_array(tbl_str)
    # collapse whitespace runs and trim every cell
    aa.each do |cells|
      next if cells.nil?
      cells.map! do |cell|
        if cell.nil?
          nil
        else
          cell.gsub(/\s+/, " ").gsub(/ +/, " ").sub(/^ +/, "").sub(/ +$/, "")
        end
      end
    end
    # stations are table columns; transpose so each station becomes one row.
    # table_turn reports failure with false.
    raa = table_turn(aa) || []
    page_ary = []
    raa.each do |row|
      station_ary = tv_yahoo_station(row, separate)
      page_ary << station_ary if station_ary
    end
    if page_ary.size > 0
      w = title.to_s.sub(/ +- +Yahoo!テレビ\.Gガイド.+$/, "")
      page_ary.unshift([w, url])
      res_ary << page_ary
    end
    current_sec += (span*60*60)
  end
  return res_ary
end
## main
# Fetch 12 hours of Tokyo listings for each of AM, FM, TV, BS1 and BS2 and
# write every category to its own file (am.txt, fm.txt, ...), encoded as
# selected by $OUT_C.
Yahoo_category.each do |cat|
  listings = tv_yahoo('東京', cat, 12, 1)
  File.open("#{cat.downcase}.txt", "w") do |out|
    out.print NKF.nkf("-m0 -#{$OUT_C}", prog_to_text(listings))
  end
end