source: vendor/python/2.5/Tools/webchecker/websucker.py

Last change on this file was 3225, checked in by bird, 18 years ago

Python 2.5

File size: 3.3 KB
Line 
1#! /usr/bin/env python
2
3"""A variant on webchecker that creates a mirror copy of a remote site."""
4
5__version__ = "$Revision: 28654 $"
6
7import os
8import sys
9import urllib
10import getopt
11
12import webchecker
13
# Extract real version number if necessary: a keyword string of the form
# "$Revision: N $" splits into exactly three words, and the middle one is kept.
if __version__.startswith('$'):
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
19
def main():
    """Command-line entry point: add each argument as a root and run a Sucker.

    Options: -q sets the verbosity to 0, each -v raises it by one; the
    starting value is webchecker.VERBOSE.

    Returns 2 after printing the error and a usage line when option parsing
    fails; otherwise falls off the end (returns None) once run() finishes.
    """
    level = webchecker.VERBOSE
    try:
        opts, roots = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error:
        # sys.exc_info()[1] is the exception instance being handled.
        print(sys.exc_info()[1])
        print("%s %s %s" % ("usage:", sys.argv[0], "[-qv] ... [rooturl] ..."))
        return 2
    for flag, _value in opts:
        # Neither option takes a value, so only the flag itself matters.
        if flag == "-q":
            level = 0
        elif flag == "-v":
            level += 1
    sucker = Sucker()
    sucker.setflags(verbose=level)
    sucker.urlopener.addheaders = [('User-agent', 'websucker/%s' % __version__)]
    for root in roots:
        print("%s %s" % ("Adding root", root))
        sucker.addroot(root)
    print("Run...")
    sucker.run()
43
class Sucker(webchecker.Checker):
    """Checker variant that keeps a local copy of every page it fetches.

    Copies are stored under relative paths of the form <host>/<path> (see
    savefilename), so they land below the current working directory.
    """

    # Overrides of webchecker.Checker settings.
    # NOTE(review): presumably these switch off external-link checking and
    # name-anchor checking respectively -- confirm against webchecker.Checker.
    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return (text, url) for a page, preferring an existing local copy.

        Only url_pair[0] (the URL) is used here; the whole pair is passed on
        to self.openpage().  If a file already exists at the mirror path it
        is read instead of fetching.  Otherwise the page is fetched, saved
        with self.savefile(), and the URL reported by the response replaces
        the requested one in the result.

        text is None when the page could not be opened or when
        self.checkforhtml() rejects it.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # No readable local copy: fetch the page and mirror it.
            f = self.openpage(url_pair)
            if f:
                # try/finally so the response is closed even if reading fails.
                try:
                    info = f.info()
                    nurl = f.geturl()
                    text = f.read()
                finally:
                    f.close()
                if nurl != url:
                    # The response reports a different URL than requested;
                    # file the copy under the reported one.
                    url = nurl
                    path = self.savefilename(url)
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # A local copy exists; there are no response headers to inspect,
            # so checkforhtml() is given an empty mapping.
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
        return text, url

    def savefile(self, text, path):
        """Write text to path, creating parent directories as needed.

        Never raises for I/O or OS errors: success and failure are both
        reported through self.message().
        """
        parent = os.path.dirname(path)
        try:
            # makedirs() is inside the try so that a directory that cannot
            # be created is reported like any other failed save instead of
            # aborting the whole run.
            makedirs(parent)
            f = open(path, "wb")
            try:
                f.write(text)
            finally:
                f.close()
            self.message("saved %s", path)
        except (IOError, os.error):
            msg = sys.exc_info()[1]
            self.message("didn't save %s: %s", path, str(msg))

    def savefilename(self, url):
        """Return the relative mirror path for url: <host>/<path>.

        The host is lower-cased with any user and port parts removed.  An
        empty path, or one ending in "/", gets "index.html" appended, and
        "/" is converted to os.sep.  ".." path segments are discarded.
        """
        scheme, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        path = path.lstrip("/")
        host = urllib.splituser(host)[1]
        host = urllib.splitnport(host)[0]
        host = host.lower()
        # The URL may be supplied by the remote server (readhtml passes the
        # value of geturl() here), so ".." segments must not be allowed to
        # climb out of the <host> directory.
        # NOTE(review): only "/"-separated segments are examined; on
        # platforms with another separator (e.g. "\\") a segment containing
        # that separator is not covered by this filter.
        path = "/".join([seg for seg in path.split("/") if seg != ".."])
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
            if os.name == "mac":
                # NOTE(review): presumably a leading separator is needed for
                # a relative path on classic Mac OS -- confirm.
                path = os.sep + path
        return os.path.join(host, path)
104
def makedirs(dir):
    """Create directory dir together with any missing parent directories.

    An empty dir is a no-op, as is a dir that already exists as a directory.
    If dir exists but is not a directory, it is converted on a best-effort
    basis: a directory takes its place and the old file becomes
    dir/index.html, so a page first saved as "x" can later contain "x/y".

    Raises os.error if a missing directory cannot be created.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            _file_to_dir(dir)
        return
    head, tail = os.path.split(dir)
    if not tail:
        # os.path.split() leaves tail empty when dir ends in a separator;
        # head would not get any shorter, so stop instead of recursing.
        print("%s %s" % ("Huh? Don't know how to make dir", dir))
        return
    makedirs(head)
    # os.mkdir's default mode is octal 777, the value previously passed
    # explicitly.
    os.mkdir(dir)


def _file_to_dir(path):
    """Replace the file at path with a directory holding it as index.html.

    Best effort: OS errors are swallowed, but a failed conversion is rolled
    back so the file is not left behind under its temporary name.
    """
    # Pick a temporary name that is not in use, so an unrelated file that
    # happens to be called "<path>.bak" is never overwritten.
    backup = path + ".bak"
    serial = 0
    while os.path.lexists(backup):
        serial += 1
        backup = "%s.bak%d" % (path, serial)
    try:
        os.rename(path, backup)
    except os.error:
        return
    try:
        os.mkdir(path)
        os.rename(backup, os.path.join(path, "index.html"))
    except os.error:
        # Undo: remove the (empty) directory if it was created and move the
        # file back to its original name.
        try:
            if os.path.isdir(path):
                os.rmdir(path)
            os.rename(backup, path)
        except os.error:
            pass
123
# Script entry: exit with main()'s return value, mapping a falsy result
# (such as None) to status 0.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status or 0)
Note: See TracBrowser for help on using the repository browser.