#!/opt/local/bin/gawk -f
#
# bbl2html.awk v1.2c
#
# Released to the public domain (ie. use at your own risk)
# Rik Blok <rikblok@mail.com>
# December 13, 2000.
#
# Latest version available from
#   http://rikblok.cjb.net/scripts/bbl2html.awk
#
# Converts a LaTeX .bbl file to (mostly) formatted html code.  Probably
# also works if applied directly to a .tex file.  Sets bookmarks
# to the keys so you can reference a citation from another page, eg.
# <a href="bib.html#key">[1]</a> will make a link to "key".
#
# bbl2html will use the default label unless you set (on the command-line)
# override=key or override=number in which case it will use the citation
# key or numeric format, respectively.
#
# I wrote this script out of dissatisfaction with other conversion tools
# available.  Hopefully it will be of use to somebody.  Feel free to
# modify the script to suit you.  I've tested it with the bibliography
# styles abbrv, alpha, apalike, ieeetr, plain, prsty, siam, and unsrt
# and it works fairly well.
# For a sample of the output visit http://rikblok.cjb.net/lib/refs.html.
#
# Usage:
#   awk -f bbl2html.awk head=<header> foot=<footer> \
#     [override=key|number] [labelwidth=<width>] \
#     [bigtable=0|1] [noabout=0|1] <infile> > <outfile>
#
# where
#   <header> and <footer> may be formatted text (enclosed in escaped
#     quotes if containing a space) to be placed at the beginning and end
#     of the output, respectively.  As a special case, if either begins with
#     the symbol "@" it is assumed to be a filename and the text is read from
#     the file specified.  If neither a header nor a footer is specified
#     <html><body> and </body></html> are used, respectively;
#   override is an optional variable to change the displayed label to
#     the citation key or numeric format;
#   <width> is the width of the column label, in pixels or, if appended
#     by "%", in percent (optional, defaults to 50 (pixels));
#   bigtable is an optional variable which allows the page to be
#     formatted as one big table (=1) or a separate table for each entry (=0)
#     (multiple tables can be displayed incrementally as the page loads but
#     a single table will be rendered faster.  Optional, defaults to 0);
#   noabout is an optional variable which tells bbl2html.awk not to
#     print the "Generated by bbl2html.awk..." comment at the bottom of the
#     page (optional, defaults to 0);
#   <infile> is the name of the bibliography file (or LaTeX file?) to
#     be converted; and
#   <outfile> is the name of the html file to be generated.
#
# Sample usages:
#   awk -f bbl2html.awk head=\<html\>\<body\> foot=\</body\>\</html\> bib.bbl > bib.html
#   awk -f bbl2html.awk head=@bib.head foot=@bib.foot override=key bib.bbl > bib.html
#   awk -f bbl2html.awk -f myOwnSubstitutions.awk bib.bbl > bib.html
#   awk -f bbl2html.awk labelwidth=20% bigtable=1 bib.bbl > bib.html
#
# Notes:
#
# 1) You can add your own substitutions fairly easily by setting
#    userfind[] and userreplace[] in the BEGIN action.  You can also
#    place the substitutions in a separate awk file (within a BEGIN
#    action) to avoid modifying this script (see the last example above
#    for a sample usage).
#
# 2) To generate a complete list of citations from a bibtex file
#    myreferences.bib use the bbl file generated by this latex file:
#
#    %%%% begin latex file
#    \documentclass{article}
#    \usepackage{url}              % if your citations have any \url{} commands
#    \begin{document}
#    \nocite{*}
#    \bibliographystyle{unsrt}     % use whichever style you prefer
#    \bibliography{myreferences}   % use myreferences.bib
#    \end{document}
#    %%%% end latex file
#
# 3) bbl2html.awk needs GNU awk/gawk.  On Solaris machines, neither
#    /usr/bin/awk nor /usr/bin/nawk work.
#
# 4) If you download this file make sure it has the proper line-endings
#    for your filesystem.  Otherwise running the script will probably
#    generate an "^ Invalid char" error.  On Unix, process the script
#    with dos2unix if you encounter this error.
#
# 5) Any occurrences of "<" and ">" must be escaped in header
#    and footer.  Eg. head=\<html\>\<body\> foot=\</body\>\</html\>
#    (not needed in files header/footer may point to).
#
# Revisions:
#   v1.2c December 13, 2000
#     - added Unix shell header (!/opt/local/bin/gawk -f) to run script
#       as a shell command.  The path may need to be modified on your
#       machine.
#     - labelwidth now defaults to pixels.  Append a percent symbol to
#       use as percent (eg. "labelwidth=20%").
#     - more linebreaks allowed in urls
#     - fixed: end-of-line comments ("%\n") left in txt of \href{url}{txt}
#     - adds "Generated by bbl2html.awk..."
#       comment at bottom of file
#       (can be disabled with "noabout=1" command-line parameter)
#   v1.2b December 11, 2000
#     - fixed: chokes on \href{url}%\n{text}
#     - basic math support (italics, super- and sub-scripts)
#     - now has bookmarks (<a name="...">) for both keys and labels
#       (if different)
#   v1.2 December 7, 2000
#     - basic support for \href{url}{txt} (tries to guess how to format
#       'txt', either as text or as an url)
#     - if neither header nor footer specified, defaults to
#       head="<html><body>" and foot="</body></html>"
#   v1.1 November 5, 2000
#     - added labelwidth option (percentage)
#     - rudimentary support for smallcaps
#     - added notes 4 and 5
#   v1.0c August 25, 2000
#     - replaced userfind[]/userreplace[] indices with descriptive keys
#   v1.0b July 31, 2000
#     - replaceFormat() checks for multiple occurrences of formatting
#     - now handles more general keys
#     - supports most bibliography styles (that I know of)
#     - defaults to using whatever labels are supplied, or numeric (if none)
#     - can override label with override=key or override=number
#   v1.0 July 28, 2000
#     - initial release
#
# To do:
#   - nothing urgent
#
# Thanks to:
#   - Marc Mutz for bug-hunting and the math substitutions
#--------------------------------------------------------------------------
BEGIN {
  # bbl2html.awk information
  version = "1.2c";
  home    = "http://rikblok.cjb.net/scripts/index.html#bbl2html.awk";

  # put user-defined substitutions here.
  # userfind[k] is a regular expression (as a string) and userreplace[k]
  # its gensub() replacement: "\\1".."\\9" refer to parenthesised groups
  # and "\\&" produces a literal ampersand.

  # arXiv.org preprint archive
  userfind["arxiv"]    = "arXiv:([a-zA-Z.-]+/[0-9][0-9][0-9][0-9][0-9][0-9][0-9])";
  userreplace["arxiv"] = "<a href=\"http://arXiv.org/abs/\\1\"><tt>arXiv:\\1</tt></a>";

  # siam style uses a horizontal line in place of repeating author names;
  # render it as eight struck-through non-breaking spaces
  nbsp = htmlEntity("nbsp");
  userfind["siam1"]    = "\\\\leavevmode\\\\vrule height 2pt depth -1.6pt width 23pt";
  userreplace["siam1"] = "<strike>" nbsp nbsp nbsp nbsp nbsp nbsp nbsp nbsp "</strike>";

  # some trivial math
  userfind["math"]    = "([^\\\\])\\$([^\\$]*)\\$";
  userreplace["math"] = "\\1<var>\\2</var>";

  # for super and subscripts:
  userfind["math.sub"]    = "([^\\\\])(\\$|<var>)(.*)_([a-zA-Z0-9]|{.*}|\\\\[a-zA-Z]+)([^\\$]*)(\\$|</var>)";
  userreplace["math.sub"] = "\\1<var>\\3<sub>\\4</sub>\\5</var>";
  userfind["math.sup"]    = "([^\\\\])(\\$|<var>)(.*)\\^([a-zA-Z0-9]|{.*}|\\\\[a-zA-Z]+)([^\\$]*)(\\$|</var>)";
  userreplace["math.sup"] = "\\1<var>\\3<sup>\\4</sup>\\5</var>";

  # strip out everything before "\begin{thebibliography}".
  # The result of getline is checked so that input without a bibliography
  # ends the loop at end-of-file instead of spinning forever.
  found = 0;
  while ((getline line) > 0) {
    if (line ~ /\\begin\{thebibliography\}/) {
      found = 1;
      break;
    }
  }
  if (!found)
    print "bbl2html.awk: warning: no \\begin{thebibliography} found in input" > "/dev/stderr";

  NR = 0;
  RS = "\\\\bibitem";           # record separator = "\bibitem"
  FS = "\\\\newblock[ \n]+";    # field separator  = "\newblock "
}

# first record: command-line parameters have been read by now
NR == 1 {
  startPage();
}

# every record
{
  label = "";   # erase label
}

# leading "[": set label and strip "[...]" from $1
$1 ~ /^\[/ {
  labelEnd = matchBrace($1, 1);
  label = substr($1, 2, labelEnd - 2);
  # alpha style
  sub(/\{\\etalchar\{\+\}\}/, "+", label);   # replace "{\etalchar{+}}" --> "+"
  # apalike style
  label = authorFormat(label);
  # strip label from line
  $1 = substr($1, labelEnd + 1);
}

# leading "{": a bibliography entry, "{key}" followed by its text
$1 ~ /^\{/ {
  keycount++;
  # if label not already set or override then set to number
  if (!label || override == "number") label = keycount;
  # the key is whatever sits between the first pair of braces
  keyEnd = matchBrace($1, 1);
  key = substr($1, 2, keyEnd - 2);
  # use key as label?
  if (override == "key") label = key;

  if (!bigtable) print "<table width=\"100%\">";
  print "<tr><td width=\"" labelwidth "\" valign=\"top\">";
  # Bookmarks for the key and (if different) the label.  Everything is
  # printed through "%s" because keys and labels are data, not format
  # strings, and each anchor is closed so that anchors never nest.
  print "<a name=\"" key "\">[" label "]</a>";
  if (key != label) printf("%s", "<a name=\"" label "\"></a>");
  printf("%s", "</td><td");
  # fixing width=100% looks better when using multiple tables
  if (!bigtable) printf("%s", " width=\"100%\"");
  printf("%s", ">");

  # process each "\newblock"-separated field as one output line
  for (lineno = 1; lineno <= NF; lineno++) {
    # the first field still carries the key: skip past it
    line = (lineno == 1) ? substr($1, keyEnd + 1) : $lineno;
    # if last line then check for "\end{thebibliography}"
    if (lineno == NF) sub(/\n\\end\{thebibliography\}/, "", line);
    # take urls out before any more processing, format the remaining
    # text, then put the formatted urls back
    line = extractLinks(line);
    line = authorFormat(line);
    line = insertLinks(line);
    print line "<br>";
  }
  printf("%s", "</td></tr>");
  if (!bigtable) print "</table>";
}

END {
  startPage();   # no-op unless the input produced no records at all
  if (bigtable) print "</table>";
  if (!noabout) {
    print "<hr><font size=\"-1\"><address>Generated by <a href=\"" home "\">bbl2html.awk</a> v" version "</address></font>";
  }
  if (foot) printhf(foot);
}

#--------------------------------------------------------------------------
# Emits the page header exactly once: applies the default head/foot and
# labelwidth, prints the header and, for bigtable=1, opens the table.
function startPage()
{
  if (pageStarted) return;
  pageStarted = 1;
  if (!head && !foot) {   # if both undefined then use defaults
    head = "<html><body>";
    foot = "</body></html>";
  }
  # put in header
  printhf(head);
  # default labelwidth = 50 (pixels)
  if (!labelwidth) labelwidth = 50;
  if (bigtable) print "<table width=\"100%\">";
}

#--------------------------------------------------------------------------
# Returns a replacement string for sub()/gsub()/gensub() that inserts the
# HTML entity called 'name'.  The ampersand is backslash-escaped because
# a bare "&" in a replacement stands for the matched text.  Building the
# entity from its parts also keeps it from ever appearing verbatim in
# this source file.
function htmlEntity(	name)	# function parameters
{
  return "\\&" name ";";
}

#--------------------------------------------------------------------------
# Replaces every \url{address} and \href{address}{text} in 'line' by a
# placeholder "__URLn__" and stores address and text in the global arrays
# url[n] and txt[n] (global counter urlcnt).  Returns the modified line.
# A command whose brace is never closed is left in place.
function extractLinks(	line,				# function parameters
			left,right,txtleft,txtright)	# local variables
{
  # \url{address}
  while (match(line, /\\url\{/) > 0) {
    left  = RSTART;
    right = matchBrace(line, left + 4);
    if (right <= left) break;   # unbalanced: nothing sensible to extract
    urlcnt++;
    url[urlcnt] = substr(line, left + 5, right - left - 5);
    txt[urlcnt] = "";           # a \url has no separate display text
    line = substr(line, 1, left - 1) "__URL" urlcnt "__" substr(line, right + 1);
  }
  # \href{address}{text}
  while (match(line, /\\href\{/) > 0) {
    left  = RSTART;
    right = matchBrace(line, left + 5);
    if (right <= left) break;   # unbalanced: nothing sensible to extract
    urlcnt++;
    url[urlcnt] = substr(line, left + 6, right - left - 6);
    txt[urlcnt] = "";
    # now find href text, starting with next '{'
    if (match(substr(line, right + 1), /\{/) > 0) {
      txtleft  = right + RSTART;
      txtright = matchBrace(line, txtleft);
      if (txtright > txtleft) {   # only accept a properly closed text
        txt[urlcnt] = substr(line, txtleft + 1, txtright - txtleft - 1);
        right = txtright;
      }
    }
    # replace
    line = substr(line, 1, left - 1) "__URL" urlcnt "__" substr(line, right + 1);
  }
  return line;
}

#--------------------------------------------------------------------------
# Puts the links stored by extractLinks() back into 'line' as formatted
# html and resets urlcnt to zero.  Returns the modified line.
function insertLinks(	line,		# function parameters
			mark,pos)	# local variables
{
  while (urlcnt > 0) {
    mark = "__URL" urlcnt "__";
    # can't use sub() because the formatted url may contain "&"
    if ((pos = index(line, mark)) > 0) {
      line = substr(line, 1, pos - 1) urlFormat(url[urlcnt], txt[urlcnt]) substr(line, pos + length(mark));
    }
    # forget the entry so its text cannot leak into a later link
    delete url[urlcnt];
    delete txt[urlcnt];
    urlcnt--;
  }
  return line;
}

#--------------------------------------------------------------------------
# Converts LaTeX markup in the author-type string s to html: small caps,
# accents, special characters, spaces, quotes, dashes, the user-defined
# substitutions (userfind[]/userreplace[]) and the \em, \bf, \it and \tt
# styles.  Remaining braces and backslashes are dropped.  Returns the
# converted string.
function authorFormat(	s,		# function parameters
			left,right,i)	# local variables
{
  # first replace small-caps formatting style so accents are handled properly
  s = replaceFormatSC(s);
  # next, replace accents
  s = gensub(/\\'([AEIOUYaeiouy])/,  "\\&\\1acute;", "g", s);
  s = gensub(/\\`([AEIOUaeiou])/,    "\\&\\1grave;", "g", s);
  s = gensub(/\\\^([AEIOUaeiou])/,   "\\&\\1circ;",  "g", s);
  s = gensub(/\\~([AEINOUaeinou])/,  "\\&\\1tilde;", "g", s);
  s = gensub(/\\[\.]([AEIOUaeiou])/, "\\&\\1ring;",  "g", s);
  s = gensub(/\\\"([AEIOUaeiou])/,   "\\&\\1uml;",   "g", s);
  s = gensub(/\\([Oo])/,             "\\&\\1slash;", "g", s);
  s = gensub(/\\(AE|ae)/,            "\\&\\1lig;",   "g", s);
  gsub(/\\ss/,   htmlEntity("szlig"), s);   # German sharp s
  gsub(/~/,      htmlEntity("nbsp"),  s);   # replace nonbreaking spaces
  gsub(/\\[,@]/, " ", s);                   # replace spaces
  gsub(/``|''/,  htmlEntity("quot"),  s);   # replace quotes
  gsub(/---/, "-", s);                      # replace dashes
  gsub(/--/,  "-", s);
  # these accents can't be displayed in HTML (with my charset) so delete 'em
  s = gensub(/\\[bcduvH]\{([a-zA-Z])\}/, "\\1", "g", s);   # eg. \u{o}
  gsub(/\{\\AA\}/, "A", s);   # \AA --> A
  gsub(/\{\\aa\}/, "a", s);   # \aa --> a
  # user-defined substitutions
  for (i in userfind) {
    s = gensub(userfind[i], userreplace[i], "g", s);
  }
  # replace formatting styles
  s = replaceFormat(s, "\\em", "<em>", "</em>");   # replace emphasis
  s = replaceFormat(s, "\\bf", "<b>",  "</b>");    # replace bold
  s = replaceFormat(s, "\\it", "<i>",  "</i>");    # replace italics
  s = replaceFormat(s, "\\tt", "<tt>", "</tt>");   # replace teletype
  gsub(/[{}]/, "", s);   # drop any remaining braces
  gsub(/\\/,   "", s);   # drop any remaining slashes
  return s;
}

#--------------------------------------------------------------------------
# Format a url as an html link.  If 'display' is passed then try to
# determine if it should be displayed formatted author-like or url-like;
# without it the url itself is displayed.
function urlFormat(	url,		# function parameters
			display)	# optional parameters
{
  gsub(/ /,   "", url);       # strip spaces
  gsub(/%\n/, "", url);       # strip end-of-line comments
  gsub(/\n/,  "", url);       # strip other linebreaks
  gsub(/%\n/, "", display);   # also strip end-of-line comments in display
  # guess how to format display, either as an url or authorFormat()
  if (!display || tolower(display) ~ /:\/\/|^mailto:/) {
    if (!display) display = url;   # default display = formatted url
    # allow linebreaks after punctuation symbols for display purposes
    display = "<tt>" gensub(/([^A-Za-z0-9 ])/, "\\1<wbr>", "g", display) "</tt>";
  }
  else {
    display = authorFormat(display);
  }
  return "<a href=\"" url "\">" display "</a>";
}

#--------------------------------------------------------------------------
# Prints s (head or foot).  If s starts with "@" then the rest is assumed
# to be a filename and the contents of that file are printed instead.
# A file that cannot be read is reported on stderr.
function printhf(	s,		# function parameters
			line,status)	# local variables
{
  # if no leading "@" then just print s
  if (s !~ /^@/) {
    print s;
    return;
  }
  # else print contents of file s
  s = substr(s, 2);   # drop leading "@"
  while ((status = (getline line < s)) > 0) print line;
  if (status < 0)
    print "bbl2html.awk: cannot read \"" s "\"" > "/dev/stderr";
  close(s);
}

#--------------------------------------------------------------------------
# Replace formatting style marks.  Use to change things like {\it et al.}
# into {<i> et al.</i>} with the usage
#   s = replaceFormat(s,"\\it","<i>","</i>");
# Returns the modified string.
function replaceFormat(	s,find,replaceleft,replaceright,	# function parameters
			left,right)				# local variables
{
  while ((left = index(s, find)) > 0) {
    # find "}" which matches assumed "{" at position 'left'
    right = matchBrace(s, left, "{");
    # insert the closing tag first so that 'left' stays valid
    if (right > left) {
      s = substr(s, 1, right - 1) replaceright substr(s, right);
    }
    s = substr(s, 1, left - 1) replaceleft substr(s, left + length(find));
  }
  return s;
}

#--------------------------------------------------------------------------
# Replace small caps "{\sc ...}" formatting style marks: lowercase letters
# become uppercase letters wrapped in <font size="-1">...</font>.
# Should be called before accents are replaced, so that the letter of an
# accent command is capitalized while it is still a plain letter.
# This routine is not robust, it assumes only a small subset of LaTeX
# commands (such as accents) will be found in the text s.  Unanticipated
# commands will probably be changed to uppercase (but, for now, this can
# probably be corrected with userfind[]/userreplace[] substitutions).
function replaceFormatSC(	s,				# function parameters
				find,replaceleft,replaceright,	# local variables
				left,right,l,c,r,cout,i,capslock,ch)
{
  find         = "\\sc";
  replaceleft  = "<font size=\"-1\">";
  replaceright = "</font>";
  while ((left = index(s, find)) > 0) {
    # find "}" which matches assumed "{" at position 'left'
    right = matchBrace(s, left, "{");
    if (right < left) right = length(s);   # unclosed: run to end of string
    # split s into parts
    l = substr(s, 1, left - 1);            # left
    left += length(find);
    c = substr(s, left, right - left + 1); # centre: the small-caps text
    r = substr(s, right + 1);              # right
    # manipulate c
    capslock = 1;   # start in uppercase
    cout = "";
    for (i = 1; i <= length(c); i++) {
      ch = substr(c, i, 1);
      if (ch ~ /[a-z]/) {
        if (capslock) {   # first of a run of lowercase: open the font tag
          ch = replaceleft toupper(ch);
          capslock = 0;
        }
        else ch = toupper(ch);
      }
      else if (ch ~ /[A-Z0-9]/) {
        if (!capslock) {  # end of a run of lowercase: close the font tag
          ch = replaceright ch;
          capslock = 1;
        }
      }
      cout = cout ch;
    }
    if (!capslock) cout = cout replaceright;
    # correct mangled accents
    gsub(/\\B\{/, "\\b{", cout);
    gsub(/\\C\{/, "\\c{", cout);
    gsub(/\\D\{/, "\\d{", cout);
    gsub(/\\U\{/, "\\u{", cout);
    gsub(/\\V\{/, "\\v{", cout);
    # recombine s
    s = l cout r;
  }
  return s;
}

#@include matchBrace.awk
#--------------------------------------------------------------------------
# matchBrace.awk - library containing single function matchBrace()
#
# Finds the matching brace for the one at index i (or assume brace==open,
# if specified) in string s.  Known pairs are () [] {} <> and `'; any
# other character is paired with itself.  Returns the index of the
# matching brace or zero if not found.
#
# The string is scanned one character at a time, so only the two brace
# characters themselves affect the nesting depth.
function matchBrace(	s,i,				# function parameters
			open,				# optional parameters
			depth,left,right,pos,n,ch)	# local variables
{
  # error trap
  if (!i) return 0;
  # if open not specified then read from substr(s,i,1)
  if (!open) open = substr(s, i, 1);
  # identify type of braces and put in left and right
  left  = "([{<`)]}>'";
  right = ")]}>'([{<`";
  pos = index(left, open);
  if (!pos) {   # not in list of braces: the character closes itself
    left  = open;
    right = open;
  }
  else {        # found in list, set match
    left  = substr(left,  pos, 1);
    right = substr(right, pos, 1);
  }
  # find matching brace
  depth = 1;
  n = length(s);
  for (pos = i + 1; pos <= n; pos++) {
    ch = substr(s, pos, 1);
    if (ch == right) {        # tested first so self-paired characters close
      if (--depth == 0) return pos;
    }
    else if (ch == left) depth++;
  }
  return 0;   # no more braces, return zero
}
#--------------------------------------------------------------------------