#! /usr/bin/perl -w use Socket; use strict; my $word = $ARGV[0] or die "Usage: mw word\n"; my $host = "www.m-w.com"; my $port = 80; my $socketaddr; my $content = "jump=$word"; # This seems to work with white space in word my $wholePage = ""; my $form = ""; my $buf = ""; my @listvalue = (); my @option = (); my $selections = 0; my $finished_selections = 0; my $reentry = 0; my $count = 0; while ($content) { openSock(); post ($content); $wholePage = ""; while ( <SOCK> ) { $wholePage .= $_; } close SOCK; print "****************************************"; # pick out the form, /s matches newline with '.' /g is greedy # /m matches internal newline if ( $wholePage =~ /The word you've entered isn't in the dictionary.*(<PRE>.*<\/PRE>)/gs ) { print "\nCan't find $word, but here are some suggestions\n"; $form = $1; } elsif ($wholePage =~ /(<form .*<\/form>)/gs ) { $form = $1; $form =~ s/Get the.*Top 10 Most Popular Sites for.*//s; } else { die "Don't know what to do with $word\n"; } # this is heavy duty kludge, geared toward www.m-w.com, needs maintenance # find out if the form has a selection of options $selections = 0; if ($form =~ /<select style=.*>/) { $selections = 1; @option = ($form =~ /^<option.*>(.*)$/mg); @listvalue = ($form =~ /name=list value="(.*)">/g); } # convert html into something more readable $form =~ s/<br>/\n/g; # change html linebreak to newline $form =~ s/<option.*?\n//mg;# delete the selection list, to be shown later $form =~ s/<[^>]*>//g; # delete all the other html tags $form =~ s/\n+/\n/g; # delete multiple newlines # visualize non-alpha-numerical ANSI characters convert_char (); print $form; print "\n"; # prompt the user for further actions: look up another word or stop here $content = ""; if ($selections | $reentry) { do { print "----------------------------------------\n"; print "Here are the related words:\n"; for (my $i=0, my $j = 1;$i<@option;$i++, $j++){ print "$j: $option[$i]\n"; } print "\nEnter a number to select from the list, or enter . to quit\n"; $buf = <STDIN>; chomp $buf; if ($buf eq '.') { $content = ""; $finished_selections = 1; } elsif ($buf !~ /\d/ or $buf > @option or $buf <= 0) { print "What did you just do? Enter . to end the session\n"; $content = ""; $finished_selections = 0; } else { $buf -= 1; $content = "hdwd=$word&book=Dictionary&jump="; $content .= urlencode ($option[$buf]); $content .= "&list="; $content .= urlencode ($listvalue[0]); $reentry = 1; $finished_selections = 1; } } until ( $finished_selections ) } # end of if selections } # end of while content ########### # subroutine: open a socket at SOCK ########### sub openSock { $socketaddr= sockaddr_in $port, inet_aton $host or die "Bad hostname\n"; socket SOCK, PF_INET, SOCK_STREAM, getprotobyname('tcp') or die "Bad socket\n"; connect SOCK, $socketaddr or die "Bad connection\n"; select((select(SOCK), $| = 1)[0]); } ########### # subroutine: urlencode a string ########### sub urlencode { my $ask = shift @_; my @a2 = unpack "C*", $ask; my $s2 = ""; while (@a2) { $s2 .= sprintf "%%%X", shift @a2; } return $s2; } ########### # subroutine: send post request to target web site ########### sub post { my $content = shift @_; my $crlf = "\015\012"; # \cM\cJ also works print SOCK "POST http://www.m-w.com/cgi-bin/dictionary HTTP/1.0$crlf"; print SOCK "Content-type: application/x-www-form-urlencoded$crlf"; my $contentLength = length $content; print SOCK "Content-length: $contentLength$crlf"; print SOCK "$crlf"; print SOCK "$content"; } ########### # subroutine: make those non-English characters visible ########### # uses the global variable $form # the character codes are iso8859-1 sub convert_char { $form =~ s/"/chr(34)/eg; $form =~ s/&/chr(38)/eg; $form =~ s/</chr(60)/eg; $form =~ s/>/chr(62)/eg; $form =~ s/ /chr(160)/eg; $form =~ s/¡/chr(161)/eg; $form =~ s/¢/chr(162)/eg; $form =~ s/£/chr(163)/eg; $form =~ s/¤/chr(164)/eg; $form =~ s/¥/chr(165)/eg; $form =~ s/¦/chr(166)/eg; $form =~ s/§/chr(167)/eg; $form =~ s/¨/chr(168)/eg; $form =~ s/©/chr(169)/eg; $form =~ s/ª/chr(170)/eg; $form =~ s/«/chr(171)/eg; $form =~ s/¬/chr(172)/eg; $form =~ s/­/chr(173)/eg; $form =~ s/®/chr(174)/eg; $form =~ s/¯/chr(175)/eg; $form =~ s/°/chr(176)/eg; $form =~ s/±/chr(177)/eg; $form =~ s/²/chr(178)/eg; $form =~ s/³/chr(179)/eg; $form =~ s/´/chr(180)/eg; $form =~ s/µ/chr(181)/eg; $form =~ s/¶/chr(182)/eg; $form =~ s/·/chr(183)/eg; $form =~ s/¸/chr(184)/eg; $form =~ s/¹/chr(185)/eg; $form =~ s/º/chr(186)/eg; $form =~ s/»/chr(187)/eg; $form =~ s/¼/chr(188)/eg; $form =~ s/½/chr(189)/eg; $form =~ s/¾/chr(190)/eg; $form =~ s/¿/chr(191)/eg; $form =~ s/À/chr(192)/eg; $form =~ s/Á/chr(193)/eg; $form =~ s/Â/chr(194)/eg; $form =~ s/Ã/chr(195)/eg; $form =~ s/Ä/chr(196)/eg; $form =~ s/Å/chr(197)/eg; $form =~ s/Æ/chr(198)/eg; $form =~ s/Ç/chr(199)/eg; $form =~ s/È/chr(200)/eg; $form =~ s/É/chr(201)/eg; $form =~ s/Ê/chr(202)/eg; $form =~ s/Ë/chr(203)/eg; $form =~ s/Ì/chr(204)/eg; $form =~ s/Í/chr(205)/eg; $form =~ s/Î/chr(206)/eg; $form =~ s/Ï/chr(207)/eg; $form =~ s/Ð/chr(208)/eg; $form =~ s/Ñ/chr(209)/eg; $form =~ s/Ò/chr(210)/eg; $form =~ s/Ó/chr(211)/eg; $form =~ s/Ô/chr(212)/eg; $form =~ s/Õ/chr(213)/eg; $form =~ s/Ö/chr(214)/eg; $form =~ s/×/chr(215)/eg; $form =~ s/Ø/chr(216)/eg; $form =~ s/Ù/chr(217)/eg; $form =~ s/Ú/chr(218)/eg; $form =~ s/Û/chr(219)/eg; $form =~ s/Ü/chr(220)/eg; $form =~ s/Ý/chr(221)/eg; $form =~ s/Þ/chr(222)/eg; $form =~ s/ß/chr(223)/eg; $form =~ s/à/chr(224)/eg; $form =~ s/á/chr(225)/eg; $form =~ s/â/chr(226)/eg; $form =~ s/ã/chr(227)/eg; $form =~ s/ä/chr(228)/eg; $form =~ s/å/chr(229)/eg; $form =~ s/æ/chr(230)/eg; $form =~ s/ç/chr(231)/eg; $form =~ s/è/chr(232)/eg; $form =~ s/é/chr(233)/eg; $form =~ s/ê/chr(234)/eg; $form =~ s/ë/chr(235)/eg; $form =~ s/ì/chr(236)/eg; $form =~ s/í/chr(237)/eg; $form =~ s/î/chr(238)/eg; $form =~ s/ï/chr(239)/eg; $form =~ s/ð/chr(240)/eg; $form =~ s/ñ/chr(241)/eg; $form =~ s/ò/chr(242)/eg; $form =~ s/ó/chr(243)/eg; $form =~ s/ô/chr(244)/eg; $form =~ s/õ/chr(245)/eg; $form =~ s/ö/chr(246)/eg; $form =~ s/÷/chr(247)/eg; $form =~ s/ø/chr(248)/eg; $form =~ s/ù/chr(249)/eg; $form =~ s/ú/chr(250)/eg; $form =~ s/û/chr(251)/eg; $form =~ s/ü/chr(252)/eg; $form =~ s/ý/chr(253)/eg; $form =~ s/þ/chr(254)/eg; $form =~ s/ÿ/chr(255)/eg; } =head1 NAME Save this file to "mw", which stands for merriam-webster, then you can run it as "mw word" or "perl mw word" =head1 DESCRIPTION a simple web robot to look up a word from Merriam-Webster site using POST method, and print the text response to STDOUT. =head1 README A special-purpose simple script that looks up a word from Merriam-Webster site. This script only uses Socket and no other external modules or packages, and it demonstrates the use of POST method to submit a FORM. However, the specific use of this script is limited to talking to www.m-w.com, and the fact that many parameters are hard-coded makes it dependent on the stability of that web site. Nonetheless, since everything is explicitly written, it is very easy to manually change those hard-coded strings. Version 2.0 adds the ability to look up a misspelled word, and changes the behavior when the user is presented a selection of words from displaying the menu once to always looping back to prompt the user again with the menu. Version 2.1 adapts to a small change in www.m-w.com Version 2.2 adapts to another small change in www.m-w.com, also fixes \r\n to \015\012 per advice of Sean M. Burke at sburke@cpan.org Version 2.2.1 The following statement is added per advice of Ed Avis <ed@membled.com> This program should be freely copied, modified, distributed along with a clearly written revision history, without affecting the license of users' other programs. =head1 PREREQUISITES requires strict module and Socket module =head1 SCRIPT CATEGORIES Web =cut