| 1 | #! /usr/bin/env python
|
|---|
| 2 |
|
|---|
class Markov:
    """Markov-chain model over sequences that can be sliced and hashed.

    Slices of the input (strings or tuples, for example) are used as
    dictionary keys, so the sequence type must yield hashable slices.

    Attributes:
        histsize: maximum number of trailing items used as the state.
        choice:   callable that picks one element from a non-empty list.
        trans:    dict mapping a state to the list of observed successors.
                  The key ``None`` holds the start entries and a successor
                  of ``None`` marks the end of a sequence.
    """

    def __init__(self, histsize, choice):
        """Store the history size and the selection callable."""
        self.trans = {}
        self.choice = choice
        self.histsize = histsize

    def add(self, state, next):
        """Record *next* as one more observed successor of *state*."""
        successors = self.trans.setdefault(state, [])
        successors.append(next)

    def put(self, seq):
        """Feed one complete sequence into the transition table."""
        depth = self.histsize
        length = len(seq)
        record = self.add
        # Start marker: an empty sequence of the same type as seq.
        record(None, seq[:0])
        for pos in range(length):
            first = pos - depth
            if first < 0:
                # Near the beginning fewer than `depth` items are available.
                first = 0
            record(seq[first:pos], seq[pos:pos + 1])
        # End marker, keyed by the trailing history of the sequence.
        record(seq[length - depth:], None)

    def get(self):
        """Generate one sequence by walking the transition table.

        Starts from an entry stored under ``None`` and keeps appending
        chosen successors until a falsy successor (the end marker) is
        picked.  Raises KeyError if nothing has been fed with put().
        """
        pick = self.choice
        table = self.trans
        depth = self.histsize
        result = pick(table[None])
        while True:
            context = result[max(0, len(result) - depth):]
            successor = pick(table[context])
            if not successor:
                return result
            result += successor
| 33 |
|
|---|
| 34 |
|
|---|
def _usage_text(prog):
    """Return the command-line help text, with *prog* as the program name."""
    return '\n'.join([
        'Usage: %s [-#] [-cddqw] [file] ...' % prog,
        'Options:',
        '-#: 1-digit history size (default 2)',
        '-c: characters (default)',
        '-w: words',
        '-d: more debugging output',
        '-q: no debugging output',
        'Input files (default stdin) are split in paragraphs',
        'separated by blank lines and each paragraph is split',
        'in words by whitespace, then reconcatenated with',
        'exactly one space separating words.',
        'Output consists of paragraphs separated by blank',
        'lines, where lines are no longer than 72 characters.',
    ])


def _wrap_words(words, limit):
    """Group *words* into space-joined lines, breaking before *limit*.

    A new line is started whenever the running width plus the next word
    exceeds *limit*.  At least one line is always returned (an empty one
    for empty input), and a first word longer than *limit* is preceded by
    an empty line.
    """
    lines = []
    current = []
    width = 0
    for w in words:
        if width + len(w) > limit:
            lines.append(' '.join(current))
            current = []
            width = 0
        current.append(w)
        width += len(w) + 1
    lines.append(' '.join(current))
    return lines


def test():
    """Command-line driver: train a Markov model and print generated text.

    Options are read from sys.argv (see _usage_text); an unknown option
    prints the usage text and exits with status 2.  Each input file, or
    stdin for '-', is split into paragraphs on blank lines, and every
    non-empty paragraph is fed to the model as a string (character mode)
    or as a tuple of words (word mode).  If nothing was fed, the function
    reports that and returns.  Otherwise it prints generated paragraphs
    in an endless loop until the process is interrupted.
    """
    import sys
    import random
    import getopt

    def say(text=''):
        # Write one line via sys.stdout so the output is identical whether
        # the script runs under Python 2 or Python 3.
        sys.stdout.write(text + '\n')

    args = sys.argv[1:]
    try:
        opts, args = getopt.getopt(args, '0123456789cdwq')
    except getopt.error:
        say(_usage_text(sys.argv[0]))
        sys.exit(2)

    histsize = 2
    do_words = False
    debug = 1
    for o, _ in opts:
        if '-0' <= o <= '-9': histsize = int(o[1:])
        if o == '-c': do_words = False
        if o == '-d': debug += 1
        if o == '-q': debug = 0
        if o == '-w': do_words = True
    if not args:
        args = ['-']

    m = Markov(histsize, random.choice)
    try:
        for filename in args:
            if filename == '-':
                if sys.stdin.isatty():
                    say('Sorry, need stdin from file')
                    continue
                if debug: say('processing %s ...' % filename)
                # stdin is deliberately left open: closing it would make a
                # later '-' argument fail on a closed file.
                text = sys.stdin.read()
            else:
                f = open(filename, 'r')
                if debug: say('processing %s ...' % filename)
                try:
                    text = f.read()
                finally:
                    # Release the handle even when the read fails.
                    f.close()
            for para in text.split('\n\n'):
                if debug > 1: say('feeding ...')
                words = para.split()
                if words:
                    if do_words:
                        data = tuple(words)
                    else:
                        data = ' '.join(words)
                    m.put(data)
    except KeyboardInterrupt:
        say('Interrupted -- continue with data read so far')
    if not m.trans:
        say('No valid input files')
        return
    if debug: say('done.')

    if debug > 1:
        # Dump the start entries and the states shorter than the history.
        for key in m.trans:
            if key is None or len(key) < histsize:
                say('%s %s' % (repr(key), m.trans[key]))
        if histsize == 0:
            # With no history the only state is the empty sequence, whose
            # type depends on the mode: () for words, '' for characters.
            empty = () if do_words else ''
            say('%s %s' % (repr(empty), m.trans[empty]))
        say()

    limit = 72
    while True:
        data = m.get()
        if do_words:
            words = data
        else:
            words = data.split()
        for line in _wrap_words(words, limit):
            say(line)
        # Blank line between generated paragraphs.
        say()
| 119 |
|
|---|
# Run the command-line driver only when executed as a script, not on import.
if __name__ == '__main__':
    test()