; $Id: dptools.el,v 1.4 2003/05/29 20:43:46 traverso Exp traverso $
;;; (load "~/bin/dptools.el")
;;;
;;; remove the space at the end of lines
(defun de-space-at-eol ()
  "Delete every run of spaces that ends a line in the accessible buffer."
  (interactive)
  (let ((buf-start (point-min-marker))
        (buf-end (point-max-marker)))
    (replace-regexp " +$" "" nil buf-start buf-end)))
;;; remove spaces before punctuation marks
(defun de-pre-punct-space ()
  "Delete the spaces that precede a punctuation mark in the accessible buffer.
The marks handled are period, comma, exclamation mark, question mark,
colon and semicolon."
  (interactive)
  (let ((buf-start (point-min-marker))
        (buf-end (point-max-marker)))
    ;; Group 1 is the punctuation mark itself; only the spaces are dropped.
    (replace-regexp " +\\([\.,!\?:;]\\)" "\\1" nil buf-start buf-end)))
;;; replace multiple spaces with one space
(defun replace-multiple-spaces ()
  "Collapse every run of spaces in the accessible buffer to a single space."
  (interactive)
  (let ((buf-start (point-min-marker))
        (buf-end (point-max-marker)))
    (replace-regexp " +" " " nil buf-start buf-end)))
;;; replace some common OCR and windows particularities
(defun de-win ()
  "Normalize common OCR and Windows-codepage artifacts in the accessible buffer.
Tabs are expanded, trailing spaces removed, spaced hyphens turned into
a double hyphen, daggers and the OE ligature spelled out, and any
remaining character that exists in the Windows codepage but not in
ISO Latin-1 replaced by an asterisk so that it stands out.

Side effect: stores the buffer bounds in the global variables `tob'
and `eob'."
  (interactive)
  ;; Kept as globals on purpose: other commands in this file assign and
  ;; read the same two variables.
  (setq tob (point-min-marker))
  (setq eob (point-max-marker))
  ;; Expand tabs.
  (untabify tob eob)
  ;; Remove spaces at end of line.
  (replace-regexp " +$" "" nil tob eob)
  ;; A spaced hyphen stands for a dash: inside a line, at the beginning
  ;; of a line, and at the end of a line.
  (replace-regexp " +- +" "--" nil tob eob)
  (replace-regexp "^- +" "--" nil tob eob)
  (replace-regexp " -$" "--" nil tob eob)
  ;; Characters of the Windows codepage that have a sensible spelling.
  (replace-string "†" "{\\dag}" nil tob eob)
  (replace-string "‡" "{\\ddag}" nil tob eob)
  (replace-regexp "Œ\\([A-Z]\\)" "OE\\1" nil tob eob)
  (replace-regexp "Œ" "Oe" nil tob eob)
  (replace-regexp "œ" "oe" nil tob eob)
  (replace-regexp "—" "--" nil tob eob)
  ;; Everything else from the 0x80-0x9F block of the Windows codepage is
  ;; flagged with an asterisk.  The former range from the euro sign to
  ;; Y-diaeresis is reversed once the text is Unicode (U+20AC > U+0178)
  ;; and therefore matched nothing, so the characters are listed
  ;; explicitly, together with the raw C1 range a Latin-1 decode yields.
  (replace-regexp "[\u0080-\u009f€‚ƒ„…ˆ‰Š‹Ž‘’“”•–˜™š›žŸ]" "*" nil tob eob))
;;; resolve the ae ligature
(defun de-ae ()
  "Spell out the AE ligature in the accessible buffer.
An upper-case ligature followed by a capital letter becomes \"AE\",
any other upper-case ligature becomes \"Ae\", and the lower-case
ligature becomes \"ae\"."
  (interactive)
  ;; Compute the bounds here.  The previous code read the globals `tob'
  ;; and `eob', which are only set as a side effect of other commands
  ;; and may be unbound or point into a different buffer.
  (let ((ae-start (point-min-marker))
        (ae-end (point-max-marker)))
    (replace-regexp "Æ\\([A-Z]\\)" "AE\\1" nil ae-start ae-end)
    (replace-regexp "Æ" "Ae" nil ae-start ae-end)
    (replace-regexp "æ" "ae" nil ae-start ae-end)))
;;; prepare end-of-lines hyphens:
(defun de-hyp ()
  "Rejoin words that were split by a hyphen or dash at the end of a line.
Each rule is applied from the beginning of the buffer, in order.  The
first word of the following line is pulled up to the hyphenated line
and the rest of that line starts a new line.  A double hyphen is
kept, a single hyphen before a capital letter is kept, and a single
hyphen before a lower-case letter is dropped."
  (interactive)
  (dolist (rule '(;; Continuation word alone on its line.
                  ("--\n\\([a-z\"][^ \n]*\\) *\n" . "--\\1\n")
                  ("-\n\\([A-ZÁ-Ý][^ \n]*\\) *\n" . "-\\1\n")
                  ("-\n\\([a-zÀ-ÿ][^ \n]*\\) *\n" . "\\1\n")
                  ;; Continuation word followed by more text.
                  ("--\n\\([a-z\"][^ \n]*\\) +" . "--\\1\n")
                  ("-\n\\([A-ZÁ-Ý][^ \n]*\\) +" . "-\\1\n")
                  ("-\n\\([a-zÀ-ÿ][^ \n]*\\)[ \n]" . "\\1\n")))
    (beginning-of-buffer)
    (replace-regexp (car rule) (cdr rule))))
;;; pre-processing
(defun insert-empty-page ()
  "Insert an \"(empty-page)\" placeholder at point when the buffer is nearly empty.
Nearly empty means that the end of the accessible buffer lies before
position 4, i.e. the buffer holds at most two characters."
  (interactive)
  ;; `insert-string' is obsolete and no longer defined in current Emacs;
  ;; `insert' does the same for a string argument.
  (if (< (point-max) 4)
      (insert "\n(empty-page)\n")))
(defun pre-dp ()
  "Pre-process the current buffer and save it.
Runs, in this order: the empty-page placeholder, replacement of a
double backslash by two asterisks, a newline appended at the end of
the buffer, then `close-html', `de-space-at-eol',
`de-pre-punct-space', `de-win', `replace-multiple-spaces', `de-bold',
`shrink-guillemets', `shrink-em-lines' and `de-hyp', and finally
`save-buffer'."
  (interactive)
  (insert-empty-page)
  (replace-string "\\\\" "**" nil (point-min-marker) (point-max-marker))
  ;; Make sure the text ends with a newline before the line-based passes.
  (goto-char (point-max))
  ;; `insert-string' is obsolete and no longer defined in current Emacs.
  (insert "\n")
  (close-html)
  (de-space-at-eol)
  (de-pre-punct-space)
  (de-win)
  (replace-multiple-spaces)
  (de-bold)
  (shrink-guillemets)
  (shrink-em-lines)
  (de-hyp)
  (save-buffer))
(defun shrink-em-lines ()
  "Remove the space before and the space after every double hyphen."
  (interactive)
  (dolist (spaced-dash '(" --" "-- "))
    (replace-string spaced-dash "--" nil (point-min-marker) (point-max-marker))))
(defun de-bold ()
  "Delete the bold tags <b> and </b> from the accessible buffer."
  (interactive)
  ;; The pattern previously read \"*b>\", which can only match a literal
  ;; asterisk followed by \"b>\" and never a tag; the leading \"</\" had
  ;; evidently been lost.  NOTE(review): confirm against the upstream
  ;; file that both the opening and the closing tag are meant.
  (replace-regexp "</?b>" "" nil (point-min-marker) (point-max-marker)))
(defun de-it ()
  "Replace the italic tags <i> and </i> with an underscore."
  (interactive)
  ;; The pattern previously read \"*i>\", which can only match a literal
  ;; asterisk followed by \"i>\" and never a tag; the leading \"</\" had
  ;; evidently been lost.  NOTE(review): confirm against the upstream
  ;; file that both the opening and the closing tag are meant.
  (replace-regexp "</?i>" "_" nil (point-min-marker) (point-max-marker)))
;;; mark for TeX
(defun pp-tex ()
  "Convert proofreading markup in the accessible buffer to TeX-style markup.
Italic tags become underscores, page separator lines of the form
\"-----File: NNN.png-----\" become a PG macro carrying the three
character page id, the block markers slash-star and star-slash become
backslash-underscore and backslash-equals, and the Footnote and
Sidenote brackets become the corresponding macros.

Side effect: stores the buffer bounds in the global variables `tob'
and `eob'."
  (interactive)
  (setq tob (point-min-marker))
  (setq eob (point-max-marker))
  ;; The pattern previously read \"*i>\", which cannot match a tag; the
  ;; leading \"</\" had evidently been lost.  NOTE(review): confirm
  ;; against the upstream file.
  (replace-regexp "</?i>" "_" nil tob eob)
  ;; A run of three, five, ... underscores gets one more so that the
  ;; count becomes even.
  (replace-regexp "\\([^_]\\(__\\)+_\\)\\([^_]\\)" "\\1_\\3" nil tob eob)
  (replace-regexp "\n+-+File: \\(...\\)\\.png-+\n+"
                  "\\\\PG{\\1}\n" nil tob eob)
  ;; The closing marker is handled first, then the opening one.
  (replace-string "*/" "\\=" nil tob eob)
  (replace-string "/*" "\\_" nil tob eob)
  (replace-string "[Footnote: " "\\Footnote[" nil tob eob)
  (replace-string "[Sidenote: " "\\Sidenote[" nil tob eob))
(defun shrink-guillemets ()
  "Remove the space after an opening and before a closing guillemet."
  (interactive)
  (dolist (pair '(("« " . "«")
                  (" »" . "»")))
    (replace-string (car pair) (cdr pair) nil
                    (point-min-marker) (point-max-marker))))
;;; pre-post-processing
(defun ppp ()
  "Run the automatic pre-post-processing passes on the current buffer.
Calls `de-space-at-eol', `de-pre-punct-space', `de-win' and `pp-tex',
in that order."
  (interactive)
  (dolist (pass '(de-space-at-eol de-pre-punct-space de-win pp-tex))
    (funcall pass)))
;;; Tools for semiautomatic adjusting of page breaks
;;; adjust hyphens at end of page
(defun adjust-pb-hyphens ()
  "Interactively rejoin words that are split across a page marker.
Starting from the beginning of the buffer, each candidate is offered
through `query-replace-regexp': the first word after the page marker
is moved in front of it, and the hyphens, asterisks and spaces around
the marker are dropped."
  (interactive)
  (beginning-of-buffer)
  (let ((split-word " *-* *\\*\\\\PG{\\(...\\)}\n\\** *\\([^ ]*\\) *")
        (rejoined "\\2\\\\PG{\\1}\n"))
    (query-replace-regexp split-word rejoined)))
;;; query for possible paragraphs at top of page
(defun adjust-page-indents ()
  "Interactively insert a blank line where a page seems to open a paragraph.
A candidate is a closing brace at the end of a line followed by a
line that starts with a hyphen, a capital letter, a double quote or
an opening guillemet.  The search runs from point."
  (interactive)
  (let ((page-top "}\n\\([-A-Z\"«]\\)")
        (with-blank-line "}\n\n\\1"))
    (query-replace-regexp page-top with-blank-line)))
;;; rewrapping
;;; make regions between \_ and \= unwrappable
(defun make-unfold ()
  "Make the regions between the block markers immune to refilling.
For every region that starts at a backslash-underscore marker and
runs to the end of the line holding the next backslash-equals marker,
each space is replaced by character 160 (no-break space) and every
line is padded with 70 such characters.  `clean-unfold' reverses
this after filling.

Side effect: assigns the global variables `pad', `unfold-begin' and
`unfold-end'."
  (interactive)
  (beginning-of-buffer)
  (setq pad (make-string 70 160))
  (while (search-forward "\\_" nil t)
    (setq unfold-begin (point-marker))
    (search-forward "\\=" nil t)
    ;; Drop a sentinel character at the start of the line after the
    ;; region and put the end marker in front of it.
    ;; `insert-string' is obsolete and no longer defined in current
    ;; Emacs, hence `insert'.
    (end-of-line)
    (forward-char)
    (insert "ª")
    (backward-char)
    (setq unfold-end (point-marker))
    (replace-string " " (make-string 1 160) nil unfold-begin unfold-end)
    (replace-regexp "$" pad nil unfold-begin unfold-end)
    ;; Remove the sentinel again.
    (goto-char unfold-end)
    (delete-char 1)))
;;; clean unwrappable regions after wrapping
(defun clean-unfold ()
  "Turn the protected block regions back into ordinary text.
For every region that starts at a backslash-underscore marker and
runs to the end of the line holding the next backslash-equals marker,
character 160 is replaced by a plain space and trailing spaces are
removed.

Side effect: assigns the global variables `unfold-begin' and
`unfold-end'."
  (interactive)
  (beginning-of-buffer)
  (let ((hard-space (make-string 1 160)))
    (while (search-forward "\\_" nil t)
      (setq unfold-begin (point-marker))
      (search-forward "\\=" nil t)
      ;; The region ends at the start of the line after the closing marker.
      (end-of-line)
      (forward-char)
      (setq unfold-end (point-marker))
      (backward-char)
      (replace-string hard-space " " nil unfold-begin unfold-end)
      (replace-regexp " +$" "" nil unfold-begin unfold-end))))
;;; rewrap and erase page markers
(defun pg-fill ()
  "Erase the page markers and refill the whole buffer.
The regions handled by `make-unfold' are protected before
`fill-region' runs and restored by `clean-unfold' afterwards.

Side effect: sets `sentence-end-double-space' to nil."
  (interactive)
  (setq sentence-end-double-space nil)
  (let ((buf-start (point-min-marker))
        (buf-end (point-max-marker)))
    ;; A page marker is the PG macro with a three character page id.
    (replace-regexp "\\\\PG{...}" "" nil buf-start buf-end))
  (make-unfold)
  (fill-region (point-min-marker) (point-max-marker))
  (clean-unfold))
(defun pg-fill-2 ()
  "Erase the page markers and refill the whole buffer.
Variant of `pg-fill' that protects the lines found by
`protect-doubly-indented-lines' before `fill-region' runs and
restores them with `unprotect-lines' afterwards.

Side effect: sets `sentence-end-double-space' to nil."
  (interactive)
  (beginning-of-buffer)
  (setq sentence-end-double-space nil)
  (let ((buf-start (point-min-marker))
        (buf-end (point-max-marker)))
    ;; A page marker is the PG macro with a three character page id.
    (replace-regexp "\\\\PG{...}" "" nil buf-start buf-end))
  ;; The protection pass searches forward from point.
  (beginning-of-buffer)
  (protect-doubly-indented-lines)
  (fill-region (point-min-marker) (point-max-marker))
  (unprotect-lines))
;;; alternative: protect from rewrapping indented lines
(defun protect-indented-lines ()
  "Protect every line after point that starts with a space.
Each such line is handed to `protect-one-line'."
  (interactive)
  (let ((indented-line "^ "))
    (while (re-search-forward indented-line nil t)
      (protect-one-line))))
;;; alternative: protect from rewrapping doubly indented lines
(defun protect-doubly-indented-lines ()
  "Protect every line after point that starts with at least two spaces.
Each such line is handed to `protect-one-line'."
  (interactive)
  ;; The pattern used to be a single leading space, which made this
  ;; command identical to `protect-indented-lines' and contradicted its
  ;; purpose.  NOTE(review): the second space was presumably lost when
  ;; whitespace was collapsed; confirm against the upstream file.
  (while (search-forward-regexp "^  " nil t)
    (protect-one-line)))
;;; replace non-breaking spaces with usual spaces, remove trailing spaces
(defun unprotect-lines ()
  "Undo line protection in the accessible buffer.
Character 160 is replaced by a plain space and trailing spaces are
removed afterwards."
  (interactive)
  (let ((hard-space (make-string 1 160)))
    (replace-string hard-space " " nil (point-min-marker) (point-max-marker)))
  (let ((buf-start (point-min-marker))
        (buf-end (point-max-marker)))
    (replace-regexp " +$" "" nil buf-start buf-end)))
;;; protect the current line
(defun protect-one-line ()
  "Protect the current line from refilling.
Every space on the line is replaced by character 160 (no-break
space), and a line shorter than 75 characters, newline included, is
padded to that length with the same character.  Point is left at the
end of the line.

Side effect: assigns the global variables `bol', `nl' and `ll'."
  (interactive)
  (beginning-of-line)
  (setq bol (point-marker))
  ;; `forward-line' always moves by logical lines.  The interactive
  ;; command `next-line' used before moves by screen lines when
  ;; `line-move-visual' is on and signals an error on the last line.
  (forward-line 1)
  (setq nl (point-marker))
  (backward-char)
  (setq ll (- nl bol))
  (replace-string " " (make-string 1 160) nil bol nl)
  (end-of-line)
  ;; `insert-string' is obsolete and no longer defined in current Emacs.
  (if (< ll 75)
      (insert (make-string (- 75 ll) 160))))
;;; remove pg-tex markups
(defun final-cleanup ()
  "Remove the block markers from the accessible buffer.
Everything from a backslash-underscore or backslash-equals marker to
the end of its line, newline included, is deleted.

Side effect: stores the buffer bounds in the global variables `tob'
and `eob'."
  (interactive)
  (setq tob (point-min-marker)
        eob (point-max-marker))
  ;; Earlier versions also restored plain spaces, removed the
  ;; slash-star block markers and turned the Footnote and Sidenote
  ;; macros back into brackets; those steps are disabled.
  (dolist (marker-line '("\\\\_.*\n" "\\\\=.*\n"))
    (replace-regexp marker-line "" nil tob eob)))
;;; restore italic markup (<i>...</i>) in place of _
(defun re-html ()
  "Turn underscore italics after point back into <i>...</i> markup.
Underscores are taken in pairs: the first of each pair becomes the
opening tag and the second the closing tag.  An underscore preceded
by a backslash, doubled, or at the very edge of the buffer is left
alone."
  (interactive)
  ;; NOTE(review): both replacement strings used to be identical and
  ;; merely deleted the underscore.  Alternating two searches only
  ;; makes sense when the replacements differ, so the opening and
  ;; closing tags had evidently been lost; confirm against upstream.
  (let ((italic-mark "\\([^\\_]\\)_\\([^_]\\)"))
    (while (re-search-forward italic-mark nil t)
      ;; FIXEDCASE is t so that a capital letter next to the underscore
      ;; does not upcase the inserted tag.
      (replace-match "\\1<i>\\2" t nil)
      ;; Step back over the character after the underscore so that it
      ;; can precede the closing mark of a one-character italic.
      (backward-char)
      ;; Guard the second half: without a match, `replace-match' would
      ;; act on stale match data and corrupt the text.
      (when (re-search-forward italic-mark nil t)
        (replace-match "\\1</i>\\2" t nil)
        (backward-char)))))
;;; common errors: no space after punctuation
;;; remove space before punctuation (interactively)
(defun query-de-pre-punct-space ()
  "Interactively delete the spaces that precede a punctuation mark.
Query version of `de-pre-punct-space'; the search runs from point."
  (interactive)
  (let ((spaced-mark " +\\([\.,!\?:;]\\)")
        (bare-mark "\\1"))
    (query-replace-regexp spaced-mark bare-mark)))
;;; query add space after punctuation/letter
(defun add-post-space ()
  "Interactively insert a space between a punctuation mark and a letter.
A candidate is one of period, comma, exclamation mark, question
mark, colon or semicolon immediately followed by an ASCII letter.
The search runs from point."
  (interactive)
  ;; The group delimiters need a doubled backslash inside a Lisp
  ;; string.  With a single backslash the parentheses were literal
  ;; characters, so the pattern never matched ordinary text and the
  ;; groups referenced by the replacement did not exist.
  (query-replace-regexp "\\([.,!?:;]\\)\\([a-zA-Z]\\)" "\\1 \\2"))
;;; replace tex accents with characters
(defun de-tex ()
  "Replace TeX accent macros with the accented characters and save the buffer.
Every replacement starts from the beginning of the buffer.  After
the accents, a double backquote with any following spaces and a
double apostrophe with any preceding spaces become a plain double
quote, and a double backslash becomes a newline."
  (interactive)
  (dolist (accent '(("\\'{E}" . "É")
                    ("\\`{E}" . "È")
                    ("\\^{E}" . "Ê")
                    ("\\`{A}" . "À")
                    ("\\^{A}" . "Â")
                    ("\\c{C}" . "Ç")
                    ("\\\"{E}" . "Ë")
                    ("\\\"{I}" . "Ï")
                    ("\\^{I}" . "Î")
                    ("\\`{O}" . "Ò")
                    ("\\^{O}" . "Ô")
                    ("\\^{U}" . "Û")
                    ("\\'{e}" . "é")
                    ("\\`{e}" . "è")
                    ("\\^{e}" . "ê")
                    ("\\`{a}" . "à")
                    ("\\^{a}" . "â")
                    ("\\c{c}" . "ç")
                    ("\\\"{e}" . "ë")
                    ("\\\"{\\i}" . "ï")
                    ("\\^{\\i}" . "î")
                    ("\\`{o}" . "ò")
                    ("\\^{o}" . "ô")
                    ("\\^{u}" . "û")
                    ("\\`{u}" . "ù")))
    (beginning-of-buffer)
    (replace-string (car accent) (cdr accent)))
  ;; TeX quotation marks.
  (dolist (quote-rule '(("`` *" . "\"")
                        (" *''" . "\"")))
    (beginning-of-buffer)
    (replace-regexp (car quote-rule) (cdr quote-rule)))
  ;; Forced line breaks.
  (beginning-of-buffer)
  (replace-string "\\\\" "\n")
  (save-buffer))
(defun fs-to-ss ()
  "Replace \"fs\" with sharp s from point to the end of the buffer, then save."
  (interactive)
  (let ((misread "fs")
        (sharp-s "ß"))
    (replace-string misread sharp-s))
  (save-buffer))
;;; Procedures:
;;;
;;; Use pre-dp (better with the shell command prePG) to pre-process
;;; the text files;
;;;
;;; Post-processing (with TeX):
;;;
;;; Use ppp for the automatic pre-post-processing; then use
;;; adjust-pb-hyphens and adjust-page-indents to manage page
;;; interruptions. Now page breaks should be OK: check them all.
;;;
;;; Use add-post-space to identify and correct missing
;;; space after punctuation.
;;;
;;; Now spell-check the file, then prepare a file namefile.tex,
;;; copying pg.tex and completing the line \input namefile.txt;
;;; prepare a pdf file with pdflatex namefile, and using namefile.pdf
;;; to read the book. Correct the source. Repeat pdflatex to refresh
;;; the corrections.
;;;
;;; Now execute pg-fill, control all the multiple blank lines, use
;;; gutcheck to find formatting problems, and execute final-cleanup.
;;;
;;; To recover a version with <i>...</i> markup use re-html
;;;
;;; this part is used to strip html markup from a FineReader file,
;;; keeping bold and italic marks.
(defun html-to-block ()
  "Shield the bold and italic tags by rewriting them with square brackets.
The tags <b>, </b>, <i> and </i> become [b], [/b], [i] and [/i], so
that `erase-html' leaves them alone."
  (interactive)
  ;; NOTE(review): as written, this first step replaces a newline with
  ;; a newline.  The search string looks truncated (probably some
  ;; markup in front of the line break); confirm against upstream.
  (replace-regexp "
" "\n" nil
                  (point-min-marker) (point-max-marker))
  (let ((buf-start (point-min-marker))
        (buf-end (point-max-marker)))
    (replace-regexp "<\\(/?[ib]\\)>" "[\\1]" nil buf-start buf-end)))
(defun block-to-html ()
  "Turn the bracketed marks [b], [/b], [i] and [/i] back into tags.
This is the inverse of the tag rewriting done by `html-to-block'."
  (interactive)
  (let ((buf-start (point-min-marker))
        (buf-end (point-max-marker)))
    (replace-regexp "\\[\\(/?[ib]\\)\\]" "<\\1>" nil buf-start buf-end)))
(defun erase-html ()
  "Delete every tag, i.e. every angle-bracketed span, from the accessible buffer."
  (interactive)
  (let ((buf-start (point-min-marker))
        (buf-end (point-max-marker)))
    (replace-regexp "<[^<>]*>" "" nil buf-start buf-end)))
(defun erase-fr-filename ()
  "Delete FineReader file names from the accessible buffer.
A file name is four digits followed by \".htm\"; everything on the
line up to and including it is removed."
  (interactive)
  ;; The dot needs a doubled backslash inside a Lisp string.  With a
  ;; single one it was an unescaped regexp dot and matched any
  ;; character in front of \"htm\".
  (replace-regexp ".*[0-9][0-9][0-9][0-9]\\.htm" "" nil
                  (point-min-marker) (point-max-marker)))
(defun replace-html-ents ()
  "Replace HTML character entities in the accessible buffer by the characters.
Upper-case bold and italic tags are lower-cased first, and white
space in front of a closing bold or italic tag is moved behind it.
Then the lt, gt and quot entities, the named entities for the
characters 160 to 255, circ and tilde are replaced, and the amp
entity last, so that an escaped entity is not decoded twice.

Side effect: stores the buffer bounds in the global variables `tob'
and `eob'."
  (interactive)
  (setq tob (point-min-marker))
  (setq eob (point-max-marker))
  ;; NOTE(review): the entity names had been decoded to the characters
  ;; themselves, which turned most replacements into no-ops, made one
  ;; search string empty, and left an unbalanced string literal that
  ;; the Lisp reader cannot parse.  The names below are the standard
  ;; Latin-1 entity names for the character codes the original code
  ;; produced.
  (let ((case-fold-search nil)  ; entity names differ only by case
        (code 160)
        (latin1-names
         '("nbsp" "iexcl" "cent" "pound" "curren" "yen" "brvbar" "sect"
           "uml" "copy" "ordf" "laquo" "not" "shy" "reg" "macr"
           "deg" "plusmn" "sup2" "sup3" "acute" "micro" "para" "middot"
           "cedil" "sup1" "ordm" "raquo" "frac14" "frac12" "frac34" "iquest"
           "Agrave" "Aacute" "Acirc" "Atilde" "Auml" "Aring" "AElig" "Ccedil"
           "Egrave" "Eacute" "Ecirc" "Euml" "Igrave" "Iacute" "Icirc" "Iuml"
           "ETH" "Ntilde" "Ograve" "Oacute" "Ocirc" "Otilde" "Ouml" "times"
           "Oslash" "Ugrave" "Uacute" "Ucirc" "Uuml" "Yacute" "THORN" "szlig"
           "agrave" "aacute" "acirc" "atilde" "auml" "aring" "aelig" "ccedil"
           "egrave" "eacute" "ecirc" "euml" "igrave" "iacute" "icirc" "iuml"
           "eth" "ntilde" "ograve" "oacute" "ocirc" "otilde" "ouml" "divide"
           "oslash" "ugrave" "uacute" "ucirc" "uuml" "yacute" "thorn" "yuml")))
    ;; Normalize the tags.
    (replace-regexp "<\\(/?\\)I>" "<\\1i>" nil tob eob)
    (replace-regexp "<\\(/?\\)B>" "<\\1b>" nil tob eob)
    (replace-regexp "\\([ \n]+\\)</\\([ib]\\)>" "</\\2>\\1" nil tob eob)
    ;; Markup characters.
    (replace-string "&lt;" "<" nil tob eob)
    (replace-string "&gt;" ">" nil tob eob)
    (replace-string "&quot;" "\"" nil tob eob)
    ;; Characters 160 to 255, in code order.
    (dolist (name latin1-names)
      (replace-string (concat "&" name ";") (make-string 1 code) nil tob eob)
      (setq code (1+ code)))
    (replace-string "&circ;" "^" nil tob eob)
    (replace-string "&tilde;" "~" nil tob eob)
    ;; The ampersand goes last.
    (replace-string "&amp;" "&" nil tob eob)))
(defun abby-to-txt ()
  "Strip the HTML markup of a FineReader file, keeping bold and italic marks.
Runs `html-to-block', `erase-html', `block-to-html',
`replace-html-ents' and `erase-fr-filename', in that order, and
saves the buffer."
  (interactive)
  (dolist (pass '(html-to-block
                  erase-html
                  block-to-html
                  replace-html-ents
                  erase-fr-filename))
    (funcall pass))
  (save-buffer))
(defun close-html ()
  "Move white space and punctuation out of bold and italic markup.
A run of spaces, newlines, periods, exclamation marks, question
marks, commas, semicolons or colons directly in front of a closing
</b> or </i> tag is moved behind the tag."
  (interactive)
  ;; NOTE(review): the tag group used to match a bare \"b>\" or \"i>\",
  ;; which never occurs inside real markup; the leading \"</\" had
  ;; evidently been lost.  Confirm against the upstream file.
  (replace-regexp "\\([ \n.!?,;:]+\\)\\(</[bi]>\\)" "\\2\\1"
                  nil (point-min-marker) (point-max-marker)))
(defun de-8 ()
  "Reduce 8-bit characters in the buffer to 7-bit spellings and save it.
Accented letters lose their accent, guillemets become double quotes,
eth and thorn become \"dh\" and \"th\", the ae ligature becomes
\"ae\", the degree sign becomes \"o\" and the section sign \"s.\".
Every step starts from the beginning of the buffer."
  (interactive)
  (dolist (step '((replace-string "ç" "c")
                  (replace-regexp "^» *" "\"")
                  (replace-regexp "« *" "\"")
                  (replace-regexp " *»" "\"")
                  (replace-regexp "[à-å]" "a")
                  (replace-regexp "[è-ë]" "e")
                  (replace-regexp "[ì-ï]" "i")
                  (replace-regexp "[ò-ö]" "o")
                  (replace-regexp "[ù-ü]" "u")
                  (replace-string "ð" "dh")
                  (replace-string "þ" "th")
                  (replace-string "ý" "y")
                  (replace-string "ÿ" "y")
                  (replace-string "ñ" "n")
                  (replace-string "æ" "ae")
                  (replace-string "°" "o")
                  (replace-string "§" "s.")))
    (beginning-of-buffer)
    ;; Each step is (COMMAND FROM TO).
    (funcall (nth 0 step) (nth 1 step) (nth 2 step)))
  (save-buffer))
(defun de-8-de ()
  "Spell out German umlauts and sharp s with ASCII letters.
Every replacement starts from the beginning of the buffer."
  (interactive)
  (dolist (umlaut '(("Ä" . "Ae")
                    ("Ö" . "Oe")
                    ("Ü" . "Ue")
                    ("ä" . "ae")
                    ("ö" . "oe")
                    ("ü" . "ue")
                    ("ß" . "ss")))
    (beginning-of-buffer)
    (replace-string (car umlaut) (cdr umlaut))))
(defun de-8-it ()
  "Spell Italian accented vowels as the vowel followed by an accent mark.
Only a vowel that is followed by a character other than a letter
from a to z is changed.  Grave accents become a backquote and the
acute e an apostrophe."
  (interactive)
  (dolist (vowel '(("à\\([^a-z]\\)" . "a`\\1")
                   ("è\\([^a-z]\\)" . "e`\\1")
                   ("é\\([^a-z]\\)" . "e'\\1")
                   ("ì\\([^a-z]\\)" . "i`\\1")
                   ("ù\\([^a-z]\\)" . "u`\\1")
                   ("ò\\([^a-z]\\)" . "o`\\1")))
    (replace-regexp (car vowel) (cdr vowel)
                    nil (point-min-marker) (point-max-marker))))