#!/usr/bin/perl -w

# Author: Kevin D. Clark (alumni.unh.edu!kdc)

# Copyright 2005 Kevin D. Clark

# This program generates content (for example, mp3 files) from
# content sites (like NPR) which allow access to their content but not in
# a format that is always convenient for their audience.

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# (version 2) as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#

###########################################################################

# Usage: npr2mp3 SHOW
#   where SHOW is the name of the show that you want
#
# See %url_info for a list of shows.
# ("npr5sum" is a good place to start)

# What you need to run this program:
#
#     A Linux box, Perl, a properly setup soundcard, RealPlayer,
#     sox, vsound, notlame

###########################################################################

# "An explanation of our rejection of respondents' unprecedented attempt
# to impose copyright liability upon the distributors of copying
# equipment requires a quite detailed recitation of the findings of the
# District Court. In summary, those findings reveal that the average
# member of the public uses a VTR principally to record a program he
# cannot view as it is being televised and then to watch it once at a
# later time. This practice, known as "time-shifting," enlarges the
# television viewing audience. For that reason, a significant amount of
# television programming may be used in this manner without objection
# from the owners of the copyrights on the programs. For the same
# reason, even the two respondents in this case, who do assert
# objections to time-shifting in this litigation, were unable to prove
# that the practice has impaired the commercial value of their
# copyrights or has created any likelihood of future harm. Given these
# findings, there is no basis in the Copyright Act upon which
# respondents can hold petitioners liable for distributing VTR's to the
# general public. The Court of Appeals' holding that respondents are
# entitled to enjoin the distribution of VTR's, to collect royalties on
# the sale of such equipment, or to obtain other relief, if affirmed,
# would enlarge the scope of respondents' statutory monopolies to
# encompass control over an article of commerce that is not the subject
# of copyright protection. Such an expansion of the copyright privilege
# is beyond the limits of the grants authorized by Congress."
#
#    Supreme Court Justice John Paul Stevens, writing for the majority,
#    SONY CORP. v. UNIVERSAL CITY STUDIOS, INC., 464 U.S. 417 (1984)

# Digital files cannot be made uncopyable, any more than water
# can be made not wet.
#                              --Bruce Schneier

# Kevin's comment: if you use this script and haven't donated generously
# to your local NPR station, bad karma is coming your way.

###########################################################################

# Version history:
#
# 0.1 - 11-feb-2005
#   Initial version.
#
# 0.2 - 22-feb-2005
#   Changed to no longer depend on a C program built on the fly; this
#   program now uses "vsound" to capture audio.  Thanks to Travis for
#   telling me about vsound!  An added benefit of using vsound is that
#   you can encode content without it being directed to your speakers
#   (so, you can listen to one program while you encode another).
#   I am told that invoking vsound with "--dspout" turns this on/off.
# Another benefit of using vsound is that this eliminates the # weirdness seen on platforms like Fedora Core 2 (very slow audio). # # # # # ########################################################################### # TODO: # The code could always be cleaned up a bit more. # # Add documentation. # # Add option to make RealAudio silent, just save to the raw file # and generate the mp3 file. This would be useful if you want to listen # to something else while you're making an mp3. (FIXED IN 0.2) # # Invoke sox with the earwax option. (needs some work with vsound) # # Investigate weirdness on FC2 (FIXED IN 0.2) # # Make sampling rate flexible (FIXED) # # Make this work on other interesting platforms, like FreeBSD and MacOSX. # # Investigate what it would take to run this on a system without # a soundcard. (for Travis) # # Investigate what it would take to run this on a system # without X. (for Travis) # # Make the whole program even more flexible so that if today's show isn't # available when you ask for it, yesterday's is retrieved instead. # # Dump the MP3 files to a more logical or configurable location. # # This might be getting to the size where we could make some of this # more OO, as well as split out the audio-saving functionality. # # Put a failsafe in the code that catches the situation where close() # isn't called -- at that point the code that encodes the sampling speed # in the filename isn't called, which means that a stream that we might have # been downloading for a while could get nuked. # # Perhaps we could enode the mp3's at a lower sample rate, to make the # resulting files smaller? # # Come up with a general way to pass arguments to vsound and notlame. use strict; use LWP::UserAgent; use HTML::TokeParser; use POSIX (qw/dup2/); use Getopt::Std; use Date::Calc qw(Today Day_of_Week Add_Delta_Days Date_to_Time Day_of_Week_to_Text Date_to_Text Localtime); # Q: How do I install the Date::Calc module on my computer? 
# A: One way would be to, as root, type "perl -MCPAN -e shell" and then
#    at the prompt type "install Date::Calc".

###########################################################################
#
# GLOBAL DATA STRUCTURES
#

my $DEBUG = 1;

my $show = "npr5sum";    # default show we are interested in

# information about various websites

my $day_of_week_re =
    qr(monday|tuesday|wednesday|thursday|friday|saturday|sunday)i;

# Each entry of %url_info describes one show:
#
#   url         - Where is the web page that contains the RealAudio link?
#                 This can either be a string that contains the URL or a
#                 ref to a function that returns a string that contains
#                 the URL.
#
#   ralinkre    - What RealAudio link are we looking for?  This is a
#                 regular expression that is matched against the text of
#                 every <a> element on the page.
#
#   ralinktrans - (optional) a code ref that is called with the href that
#                 was found and returns the URL that should really be
#                 fetched.
my %url_info = (
    cartalktest => {
        url      => "http://cartalk.com/Radio/Show/online.html",
        ralinkre => qr(^\s*Segment\s+1\s*:\s*$)i,    # matches "Segment 1:"
    },

    cartalk => {
        url      => "http://cartalk.com/Radio/Show/online.html",
        ralinkre => qr(Segments\s+1\s*-\s*\d+)i,  # matches "Segments 1 - 10"
    },

    waitwait => {
        url         => "http://www.npr.org/programs/waitwait/",
        ralinkre    => qr(Listen to the show)i,
        ralinktrans => \&npr_js_link_trans,
    },

    atc => {                      # all things considered
        url         => "http://www.npr.org/programs/atc/",
        ralinkre    => qr(Listen to ${day_of_week_re}'s show)i,
        ralinktrans => \&npr_js_link_trans,
    },

    morning => {                  # morning edition
        url         => "http://www.npr.org/programs/morning/",
        ralinkre    => qr(Listen to ${day_of_week_re}\'s show)i,
        ralinktrans => \&npr_js_link_trans,
    },

    npr5sum => {                  # Handy 5 minute summary of news
        url         => "http://www.npr.org/",
        ralinkre    => qr(Hourly Newscast)i,
        ralinktrans => \&npr_js_link_trans,
    },

    fresh_air => {
        url         => "http://freshair.npr.org/",
        ralinkre    => qr(Listen to ${day_of_week_re}\'s show)i,
        ralinktrans => \&npr_js_link_trans,
    },

    wesat => {                    # weekend edition saturday
        url         => "http://www.npr.org/programs/wesat/",
        ralinkre    => qr(Listen to ${day_of_week_re}\'s show)i,
        ralinktrans => \&npr_js_link_trans,
    },

    wesun => {                    # weekend edition sunday
        # This used to point at the "wesat" page (copy-and-paste error).
        url         => "http://www.npr.org/programs/wesun/",
        ralinkre    => qr(Listen to ${day_of_week_re}\'s show)i,
        ralinktrans => \&npr_js_link_trans,
    },

    totn => {                     # talk of the nation
        url         => "http://www.npr.org/programs/totn/",
        ralinkre    => qr(Listen to ${day_of_week_re}\'s show)i,
        ralinktrans => \&npr_js_link_trans,
    },

    day => {
        url         => "http://www.npr.org/programs/day/",
        ralinkre    => qr(Listen to ${day_of_week_re}\'s show)i,
        ralinktrans => \&npr_js_link_trans,
    },

    marketplace => {
        url      => "http://www.marketplace.org/",
        ralinkre => qr(Listen to P\.M\. show)i,
    },

    herenow => {                  # here and now
        url => sub {
            # we need a url like this:
            # http://www.here-now.org/shows/2005/02/20050203.asp
            my ($mday, $mon, $year) = (localtime)[3, 4, 5];
            $year += 1900;
            $mon  += 1;           # localtime months are 0-based

            return sprintf(
                "http://www.here-now.org/shows/%d/%.2d/%d%.2d%.2d.asp",
                $year, $mon, $year, $mon, $mday);
        },
        ralinkre => qr(Listen to the show)i,
    },

    prairie => {
        url => sub {
            # we want a url like this
            # http://prairiehome.publicradio.org/programs/2005/02/05/
            #
            # Let's just assume that we're always looking to listen to last
            # Saturday's show.  If you are running this script on a Saturday
            # night and expecting to get today's show, you really need to
            # get a life.
            my @today       = Today();
            my $current_dow = Day_of_Week(@today);  # 1 = Monday .. 7 = Sunday

            # Days to go back to reach the most recent Saturday that is
            # strictly before today (Sunday -> 1, Monday -> 2, ...,
            # Saturday -> 7).
            my $delta = ($current_dow == 7) ? 1 : ($current_dow + 1);

            # Use the calendar date from Add_Delta_Days directly.  The old
            # code pushed it through Date_to_Time (GMT based) and localtime
            # and then added 1 to the day of the month, which produced
            # dates like "01/32" at month boundaries and the wrong day in
            # timezones at or east of UTC.
            my ($year, $month, $day) = Add_Delta_Days(@today, -$delta);

            return sprintf(
                "http://prairiehome.publicradio.org/programs/%d/%.2d/%.2d/",
                $year, $month, $day);
        },
        ralinkre    => qr(Listen to the whole show)i,
        ralinktrans => sub {
            # this needs to be fixed, because technically what I am doing
            # here isn't correct.
            my ($relurl) = @_;
            my $result = "http://prairiehome.publicradio.org" . $relurl;

            if ($DEBUG) {
                print "Transforming link from: $relurl\n";
                print "                    to: $result\n";
            }

            return $result;
        },
    },
);

my %opts = (
    # kindof like perl/sed's -n flag
    # if -n is specified, then audio won't be sent to the dsp device.
    #
    # NOTE: the option is parsed by getopts() below, but nothing in this
    # file reads $opts{n} yet.
    n => 0,
);

my $tempdir   = "/tmp/npr2mp3.$$";
my $ratmpfile = "$tempdir/ratmp";

###########################################################################
###########################################################################
###########################################################################

# Given a $html document, this routine extracts the href of the $nth link
# (counting from 1) whose trimmed link text matches the regular expression
# $linkre.  A link without an href attribute is reported as "-".
#
# Returns the href, or undef when fewer than $nth links match.
sub extract_link {
    my ($html, $linkre, $nth) = @_;

    my $parser = HTML::TokeParser->new(\$html);

    while (my $tag = $parser->get_tag("a")) {
        my $href = $tag->[1]{href} || "-";
        my $text = $parser->get_trimmed_text("/a");

        # Match against the pattern we were handed rather than reaching
        # into the global %url_info.
        next unless $text =~ /$linkre/;

        return $href if (--$nth <= 0);
    }

    return;
}

###########################################################################

# Resolves the "url" member of a %url_info entry.
#
# $u is either a plain string (returned as is) or a code ref (called with
# no arguments; its return value is used).  Returns undef for anything
# else.
sub get_show_url {
    my ($u) = @_;

    return $u       if (!ref($u));
    return $u->()   if (ref($u) eq "CODE");
    return;
}

###########################################################################

# Asks each of the given pids to go away, with increasing rudeness:
# SIGHUP, then SIGTERM, then SIGKILL.  A process that is already gone
# (kill 0 fails) is left alone; we pause for a second after each of the
# two polite signals to give the process a chance to exit.
sub kill_kill_kill {
    # One of these days...
    #   -- Pink Floyd

    foreach my $pid (@_) {
        foreach my $sig (POSIX::SIGHUP(), POSIX::SIGTERM()) {
            last unless kill(0, $pid);
            kill($sig, $pid) && sleep(1);
        }

        kill(POSIX::SIGKILL(), $pid) if kill(0, $pid);
    }
}

##########################################################################

# Starts RealPlayer under vsound so that the audio of $ratmpfile is
# captured to $tempdir/$show.wav.  Returns the pid of the background
# process so that the caller can wait for it.
sub start_ra {
    # kill off any RealPlayer application that is already running
    #
    # this is kindof gross
    system("kill `ps -elf | egrep '[r]ealplay' | awk '{print \$4}'` >/dev/null 2>&1");

    return start_in_background(
        "vsound -k -v -f $tempdir/$show.wav realplay --quit $ratmpfile");
}

##########################################################################

# Runs $cmd (a whitespace separated command line, no shell quoting) in a
# child process with stdin and stdout attached to /dev/null.  stderr is
# left alone in case Something Bad happens.
#
# Returns the pid of the child; dies if the process cannot be forked.
sub start_in_background {
    my ($cmd) = @_;

    # split(' ') skips leading whitespace and collapses runs of it, so we
    # never hand empty arguments to exec.  (no point in using $IFS)
    my @cmd = split(' ', $cmd);
    die "start_in_background: empty command\n" if (!@cmd);

    my $pid = fork();
    die "cannot fork: $!" if (!defined($pid));

    return $pid if ($pid != 0);    # parent

    # child
    my $in = POSIX::open("/dev/null", POSIX::O_RDONLY());
    defined($in) || die "Can't open stdin /dev/null: $!\n";
    defined(dup2($in, 0)) || die "Can't dup stdin: $!\n";
    POSIX::close($in) if ($in > 0);

    my $out = POSIX::open("/dev/null", POSIX::O_WRONLY());
    defined($out) || die "Can't open stdout /dev/null: $!\n";
    defined(dup2($out, 1)) || die "Can't dup stdout: $!\n";
    POSIX::close($out) if ($out > 1);

    # exec only returns on failure
    exec { $cmd[0] } @cmd;
    die "can't exec '$cmd': $!";
}

###########################################################################

# Turns one of NPR's javascript: pseudo links into a URL that can actually
# be fetched.  Links that match none of the known forms are returned
# unchanged.
sub npr_js_link_trans {
    my ($nprjslink) = @_;

    # we want to transform something like this:
    #    javascript:getStaticMedia('/waitwait/20050108_waitwait','RM,WM')"
    # into:
    #    http://www.npr.org/dmg/dmg.php?mediaURL=/waitwait/20050108_waitwait&mediaType=RM
    $nprjslink =~ s{.*getStaticMedia\('(.*?)'.*}
                   {http://www.npr.org/dmg/dmg.php?mediaURL=$1&mediaType=RM}x;

    # also, change:
    #    javascript:getMedia('ATC','13-Jan-2005','all','WM,RM');
    #    (the arguments are prgCode, showDate, segNum, mediaPreference)
    # to:
    #    http://www.npr.org/dmg/dmg.php?prgCode=ATC&showDate=13-Jan-2005&segNum=&mediaPref=RM&getUnderwriting=1
    #
    # Note that segNum is deliberately left empty in the result even
    # though the third argument is captured.
    $nprjslink =~ s{.*getMedia\('(.*?)'\s*,\s*   # "prgCode"
                                '(.*?)'\s*,\s*   # "showDate"
                                '(.*?)'\s*,\s*   # "all"
                                .*}
                   {http://www.npr.org/dmg/dmg.php?prgCode=$1&showDate=$2&segNum=&mediaPref=RM&getUnderwriting=1}x;

    # also, change:
    #    javascript:getNewsCast();
    # to:
    #    http://www.npr.org/dmg/dmg.php?getNewsCast=true&NPRMediaPref=RM
    $nprjslink =~ s{.*getNewsCast\(\).*}
                   {http://www.npr.org/dmg/dmg.php?getNewsCast=true&NPRMediaPref=RM};

    return $nprjslink;
}

#
# MAIN ROUTINE
#

getopts('n', \%opts);

# Fall back to the default show declared at the top of the file (this used
# to be a second, different hard-coded default).
$show = shift || $show;

die "Unknown show: $show\n" if (!exists($url_info{$show}));

print "We're getting the audio for this show: $show\n";

my $show_url = get_show_url($url_info{$show}{url});
die "Don't know how to build the url for $show\n" if (!defined($show_url));

#
# Get the show's main HTML page
#

my $ua  = LWP::UserAgent->new;
my $ret = $ua->get($show_url);

die "Unable to get $show url: ".$show_url. ": ".$ret->status_line."\n"
    if (!$ret->is_success);

my $ralink = extract_link($ret->content, $url_info{$show}{ralinkre}, 1);

die "Unable to find link for $show\n" if (!defined($ralink));

print "Link is '$ralink'\n" if ($DEBUG);

# possibly transform the link
if (defined($url_info{$show}{ralinktrans})) {
    $ralink = $url_info{$show}{ralinktrans}->($ralink);

    print "Link is changed to '$ralink'\n" if ($DEBUG);
}

# Only create the scratch directory once we know there is something to
# put in it.
mkdir($tempdir) || die "Unable to mkdir $tempdir: $!\n";

#
# Get the RealAudio thingie
#

my $rareq = HTTP::Request->new('GET', $ralink);
$ret = $ua->request($rareq, $ratmpfile);

die "Unable to get RealAudio url: ".$ralink. ": ".$ret->status_line."\n"
    if (!$ret->is_success);

#
# Run RealAudio application, capturing audio in the background
#

my $rapid = start_ra();

print "Running realplay....\n";

waitpid($rapid, 0);

print "...done\n";

print "Converting streams to mp3 with notlame'...\n";

my @notlamecmd = ("notlame", "$tempdir/$show.wav", "$tempdir/$show.mp3");
print "@notlamecmd\n";

# system() returns 0 on success, so "system(...) || die" would be backwards.
system(@notlamecmd) == 0
    or die "Problem running notlame!\n";

if (!$DEBUG) {
    unlink(glob("$tempdir/*.wav"), $ratmpfile);
}

print "Done.  Final file is $tempdir/$show.mp3\n";