dbContentReader._pike
Go to the documentation of this file.
1 /*0*/
2 class dbContentReader {
3 public:
4 int documents;
5 int transfer;
6 int homeid;
7 
/**
 * Write the content of the document stored under the given path to a file.
 *
 * Looks up the object whose OBJ_PATH attribute equals the double-quoted
 * path and forwards its id to read_file_from_oid(). When no such row
 * exists a message is printed on stderr and nothing is written.
 *
 * @param db   database handle; must provide query() and quote()
 * @param path object path to look up (compared against the OBJ_PATH value)
 * @param file destination object, handed on to read_file_from_oid()
 */
void read_file_from_path(object db, string path, object file)
{
    // Escape the path the same way check_path() does before embedding it,
    // so a quote or backslash in the path cannot break the statement.
    array result = db->query(
        "select ob_id from ob_data where ob_attr='OBJ_PATH' AND ob_data='\""+
        db->quote(path)+"\"';");

    if (sizeof(result) == 0) {
        werror("Unable to retrieve file %s - path not found!\n", path);
        return;
    }

    // Cast explicitly, like the other ob_id lookups in this file do.
    int oid = (int)result[0]["ob_id"];
    read_file_from_oid(db, oid, file);
}
20 
/**
 * Write the content of the document with the given object id to a file.
 *
 * Reads the CONTENT_ID attribute of the object and passes its integer
 * value to read_file(). When the object has no CONTENT_ID row a message
 * is printed on stderr and nothing is written.
 *
 * @param db   database handle; must provide query()
 * @param oid  object id whose CONTENT_ID attribute is looked up
 * @param file destination object, handed on to read_file()
 */
void read_file_from_oid(object db, int oid, object file)
{
    string sql =
        "select ob_data from ob_data where ob_attr='CONTENT_ID' AND ob_id='"+
        oid+"';";
    array rows = db->query(sql);

    if (!(sizeof(rows) > 0)) {
        werror("Unable to retrieve file %O - no Document!\n", oid);
        return;
    }

    int content_id = (int)rows[0]["ob_data"];
    read_file(db, content_id, file);
}
31 
/**
 * Determine the path of an object.
 *
 * If the object has an OBJ_PATH attribute, its value (with the surrounding
 * double quotes removed) is returned. Otherwise the path is derived from
 * the 'Environment' attribute: the path of the environment object plus
 * "/" plus this object's name. An object without environment whose
 * creator owns a GROUP_WORKROOM or USER_WORKROOM attribute is mapped to
 * "/home/<creator name>".
 *
 * @param db  database handle; must provide query()
 * @param oid id of the object to resolve
 * @return the path, or 0 when it cannot be determined
 */
string get_path(object db, int oid)
{
    array res = db->query("select ob_data from ob_data where ob_id="+
                          oid + " and ob_attr='OBJ_PATH'");
    if (sizeof(res) > 0) {
        // Stored form is "<path>" including the double quotes.
        string p = "";
        sscanf(res[0]["ob_data"], "\"%s\"", p);
        return p;
    }

    array env = db->query("select ob_data from ob_data where ob_id="+
                          oid + " and ob_attr='Environment'");
    if (sizeof(env) == 0)
        return 0;

    // The attribute is parsed as a literal '%' followed by the numeric id.
    int envid;
    sscanf(env[0]["ob_data"], "%%%d", envid);

    if (envid == 0) {
        // No environment: only objects created by a workroom owner can be
        // placed, namely below /home.
        array creator = db->query("select ob_data from ob_data where ob_id="+oid+" AND ob_attr='Creator'");
        if (sizeof(creator) > 0) {
            int creatorid;
            sscanf(creator[0]["ob_data"], "%%%d", creatorid);
            array wr = db->query("select ob_data from ob_data where ob_id="+creatorid + " AND (ob_attr='GROUP_WORKROOM' OR ob_attr='USER_WORKROOM')");
            if (sizeof(wr) > 0) {
                string owner = get_name(db, creatorid);
                // Do not fabricate "/home/0" when the creator has no name.
                return stringp(owner) ? "/home/" + owner : 0;
            }
        }
        // Nothing to recurse into; resolving object 0 would only yield a
        // bogus "0/<name>" path.
        return 0;
    }

    if (envid == oid) {
        // The format string needs the object id as its argument.
        werror("Fatal error: Object %d is in itself!\n", oid);
        return "/void/"+oid;
    }

    string parent = get_path(db, envid);
    string name = get_name(db, oid);
    // A missing parent path or name must not be concatenated as "0".
    if (!stringp(parent) || !stringp(name))
        return 0;
    return parent + "/" + name;
}
64 
/**
 * Fetch the name of an object from its 'identifier' attribute.
 *
 * The first and the last character of the stored value are dropped
 * (presumably the enclosing double quotes - confirm against the data).
 *
 * @param db  database handle; must provide query()
 * @param oid id of the object
 * @return the stripped identifier, the unchanged value when it is empty,
 *         or 0 when the object has no 'identifier' row
 */
string get_name(object db, int oid)
{
    array rows = db->query("select ob_data from ob_data where ob_id='"+
                           oid + "' and ob_attr='identifier'");
    if (!sizeof(rows))
        return 0;

    string raw = rows[0]["ob_data"];
    int len = sizeof(raw);
    if (len <= 0)
        return raw;
    return raw[1..len-2];
}
75 
/**
 * Try to set the OBJ_PATH attribute of every object whose class starts
 * with '/classes/Doc' and that has no OBJ_PATH row yet.
 *
 * The path is computed with get_path(); the number of objects handled and
 * the number of objects without a computable path are reported on stderr.
 *
 * @param db database handle; must provide query() and quote()
 */
void check_path(object db)
{
    int fixed = 0, fail = 0;
    array objects = db->query("select ob_id from ob_class where ob_class like '/classes/Doc%'");

    if (sizeof(objects) > 0) {
        write("Checking path for " + sizeof(objects) + " objects !\n");
        for (int i = 0; i < sizeof(objects); i++) {
            mixed obj = objects[i];
            int oid = (int)obj["ob_id"];
            array existing = db->query("select ob_data from ob_data where ob_id="+
                                       oid + " and ob_attr='OBJ_PATH'");
            // Objects that already carry a path are left alone.
            if (sizeof(existing) != 0)
                continue;

            string path = get_path(db, oid);
            if (!stringp(path)) {
                fail++;
                continue;
            }

            // NOTE(review): this point is only reached when no OBJ_PATH row
            // exists for oid, yet the statement is an UPDATE restricted to
            // exactly that (ob_id, ob_attr) pair - it looks like it cannot
            // match any row and an INSERT may be intended. Confirm against
            // the table schema.
            db->query("update ob_data SET ob_data='\""+
                      db->quote(path)+"\"' where "+
                      "ob_id="+oid + " AND ob_attr='OBJ_PATH'");
            fixed++;
        }
    }
    werror("Fixed %d Path (%d failed)\n", fixed, fail);
}
102 
/**
 * Build a relative, hashed file path for a content id.
 *
 * The last component is the id as a decimal number padded to at least
 * five digits. It is preceded by one two-digit hexadecimal directory per
 * byte of (content_id >> 8), most significant byte first; at least one
 * such directory is always present.
 *
 * Example: 74565 (0x12345) becomes "01/23/74565".
 *
 * @param content_id the content id
 * @return the path, or 0 when content_id is 0
 */
string content_id_to_path(int content_id) {
    if (!content_id)
        return 0;

    array(string) parts = ({ sprintf("%05d", content_id) });
    int rest = content_id >> 8;
    while (1) {
        // Prepend the directory for the current low byte.
        parts = ({ sprintf("%02x", rest & 0xff) }) + parts;
        rest >>= 8;
        if (!(rest > 0))
            break;
    }
    return parts * "/";
}
113 
114 
115 
/**
 * Path mapping that keeps the object path as the file name.
 *
 * When the last path component is a plain integer (it survives a
 * string -> int -> string round trip unchanged) the suffix ".content"
 * is appended; every other path is returned as it is.
 *
 * @param db   unused here; part of the mapping function signature
 * @param oid  unused here; part of the mapping function signature
 * @param path object path
 * @return the path to use for the exported file
 */
string mapPath(object db, int oid, string path) {
    string leaf = basename(path);
    int numeric_leaf = ((string)((int)leaf) == leaf);
    return numeric_leaf ? path + ".content" : path;
}
122 
/**
 * Path mapping that names the exported file after the content id.
 *
 * Reads the CONTENT_ID attribute of the object and converts it with
 * content_id_to_path(). The path argument is not used.
 *
 * @param db   database handle; must provide query()
 * @param oid  id of the object
 * @param path unused here; part of the mapping function signature
 * @return the hashed path, or 0 when the object has no CONTENT_ID row
 */
string mapPathId(object db, int oid, string path) {
    array rows = db->query(
        "select ob_data from ob_data where ob_attr='CONTENT_ID' AND ob_id="
        +oid);
    if (!(sizeof(rows) > 0))
        return 0;
    return content_id_to_path((int)rows[0]["ob_data"]);
}
135 
136 
/**
 * Export all objects whose OBJ_PATH starts with the given path into the
 * directory params->output.
 *
 * For every matching object the stored path (without quotes and leading
 * character) is optionally translated by mapPathFunction; a result of 0
 * skips the object. "/versions" and "/annotations" inside that relative
 * path are rewritten to "__versions" and "__annotations". Objects whose
 * class starts with "/classes/Doc" are written as files via read_file();
 * objects of class "/classes/Container" or "/classes/Room" become
 * directories, unless mapPathFunction is mapPathId.
 *
 * Side effect: a trailing "/" is appended to params->output if missing.
 *
 * @param db              database handle; must provide query(), quote()
 *                        and whatever read_file() needs
 * @param path            path prefix to export
 * @param params          option mapping; "output" names the target directory
 * @param mapPathFunction optional function(db, oid, path) returning the
 *                        relative file name or 0
 */
void read_files_from_path(object db, string path, mapping params, function mapPathFunction)
{
    // Reject a missing, valueless (--output) or empty (--output=) option;
    // indexing an empty string with [-1] below would otherwise throw.
    if (!stringp(params->output) || params->output == "") {
        werror("You need to specify an output directory! (--output=)\n");
        return;
    }

    // Escape the prefix before embedding it, as check_path() does for the
    // values it writes.
    array result = db->query(
        "select ob_id, ob_data from ob_data where ob_attr='OBJ_PATH' AND "+
        "ob_data like '\""+db->quote(path)+"%';");
    if (sizeof(result) == 0) {
        werror("Unable to retrieve files in %s - path not found!\n", path);
        return;
    }

    write("Fetching %d Document from Database in Path %s\n", sizeof(result),
          path);

    // Loop invariant: prepare the output directory once.
    Stdio.mkdirhier(params->output);
    if (params->output[-1] != '/')
        params->output += "/";

    foreach (result, mapping row) {
        int oid = (int)row["ob_id"];
        string p = (string)row["ob_data"];
        sscanf(p, "\"%s\"", p);
        p = p[1..];   // drop the first character of the stored path
        if (functionp(mapPathFunction)) {
            p = mapPathFunction(db, oid, p);
            if (p == 0)
                continue;
        }

        // Rewrite only the relative part so that an output directory that
        // itself contains "/annotations" is never altered.
        p = replace(p, "/versions", "__versions");
        p = replace(p, "/annotations", "__annotations");
        string name = params->output + p;
        array directory = name / "/";

        array classResult = db->query(
            "select ob_class, obversionof from ob_class where ob_id='"+ oid + "';");
        if (sizeof(classResult) == 0)
            continue;

        string obclass = classResult[0]["ob_class"];
        // NOTE(review): the query selects "obversionof" but the test reads
        // the key "obversion", which is not part of the result, so this
        // condition is always true and versions are exported as well.
        // Kept as is; confirm whether versions should be skipped.
        if (search(obclass, "/classes/Doc") == 0 &&
            classResult[0]["obversion"] == 0)
        {
            array doc_id = db->query(
                "select ob_data from ob_data where ob_attr='CONTENT_ID' AND ob_id='"+
                oid+"';");
            if (sizeof(doc_id) > 0) {
                if (sizeof(directory) > 1)
                    Stdio.mkdirhier(directory[..sizeof(directory)-2] * "/");
                write("Creating file " + name + "\n");
                Stdio.File f = Stdio.File(name, "wct");
                read_file(db, (int)doc_id[0]["ob_data"], f);
                f->close();
            }
        }
        else if (obclass == "/classes/Container" || obclass == "/classes/Room")
        {
            // In content-id mode the object hierarchy is not mirrored.
            if (mapPathFunction == mapPathId)
                continue;
            Stdio.mkdirhier(name);
        }
    }
}
201 
/**
 * Copy the stored content of one document into a file object.
 *
 * All rec_data rows of doc_data with the given doc_id are written to
 * file in rec_order. The counters 'documents' (one per call) and
 * 'transfer' (length of the data written) are updated.
 *
 * @param db   database handle; must provide big_query()
 * @param id   doc_id of the content to read
 * @param file destination; must provide write()
 */
void read_file(object db, int id, object file)
{
    string sql = "select rec_data from doc_data where doc_id=" + id +
                 " order by rec_order";
    Sql.sql_result rows = db->big_query(sql);

    documents++;

    array chunk;
    while ((chunk = rows->fetch_row())) {
        mixed data = chunk[0];
        transfer += sizeof(data);
        file->write(data);
    }
}
213 
/**
 * Command line entry point.
 *
 * Options have the form --name=value or --name (stored as 1):
 *   --db=<url>      database URL (default mysql://steam:steam@localhost/steam)
 *   --check-path    run check_path() and exit
 *   --oid=<id>      write the content with that id to "<id>.file"
 *   --file=<path>   write the document at <path> to its base name
 *   --files=<path>  export everything below <path> into --output;
 *                   --mode=hash names the files after their content id
 *   --output=<dir>  target directory
 *
 * Without one of the modes above, the doc_ids of doc_data that no
 * CONTENT_ID attribute refers to are counted and, if --output is given,
 * saved as "<doc_id>.file"; the external "file -i" command is then used
 * to pick a better extension from the table below.
 *
 * @param argc number of arguments
 * @param args argument array; args[0] is skipped
 */
void main(int argc, array args) {
    int tt = time();
    documents = 0;
    transfer = 0;
    mapping params = ([ ]);
    mapping mimetypes = ([
        "image/jpeg":"jpg",
        "image/gif":"gif",
        "application/msword": "doc",
        "application/pdf": "pdf",
        "audio/mpeg": "mp3",
        "image/bmp": "bmp",
        "text/plain": "text",
        "text/xml": "xml",
        "image/tiff": "tiff",
        // was misspelled "application/wnd.ms-powerpoint"
        "application/vnd.ms-powerpoint":"ppt",
        "application/x-shockwave-flash": "swf",
        "application/x-gzip":"zip",
        "application/x-gtar": "gtar",
        "application/x-tar": "tar",
        "audio/x-pn-realaudio": "ra",
        "audio/x-wav": "wav",
        "image/svg": "svg",
        "video/x-msvideo": "avi",
        "video/x-ms-wmv": "wmv",
        "application/vnd.ms-excel": "xls",
        "source/pike": "pike",
        "text/wiki": "wiki",
        "text/html": "html",
        "image/png": "png",
        "application/x-javascript": "js",
    ]);

    params["db"] = "mysql://steam:steam@localhost/steam";
    for (int i = 1; i < argc; i++) {
        string type, val;
        if (sscanf(args[i], "--%s=%s", type, val) >= 2)
            params[type] = val;
        else if (sscanf(args[i], "--%s", type) >= 1)
            params[type] = 1;
    }

    Sql.Sql db = Sql.Sql(params->db);

    if (params["check-path"]) {
        check_path(db);
        return;
    }

    // The stringp() guards below skip options given without a value
    // (stored as the integer 1), which would make sscanf()/basename() throw.
    if (stringp(params["oid"])) {
        int oid;
        if (sscanf(params["oid"], "%d", oid) > 0) {
            Stdio.File f = Stdio.File(oid+".file", "wct");
            // NOTE(review): the value is passed to read_file() and is thus
            // used as a doc_data id; it is not resolved through the
            // CONTENT_ID attribute as read_file_from_oid() would do.
            // Confirm that this is intended.
            read_file(db, oid, f);
            f->close();
            return;
        }
    }
    if (stringp(params["file"])) {
        Stdio.File f = Stdio.File(basename(params->file), "wct");
        read_file_from_path(db, params["file"], f);
        f->close();
        return;
    }
    if (stringp(params["files"])) {
        if (params["mode"] == "hash")
            read_files_from_path(db, params->files, params, mapPathId);
        else
            read_files_from_path(db, params->files, params, mapPath);
        transfer = transfer / (1024*1024);
        tt = max(time() - tt, 1);

        write("-- %d Documents in %d seconds, %d mb, %d mb/s", documents,
              tt, transfer, transfer/tt);
        return;
    }

    write("Getting DOC IDs ....\n");
    Sql.sql_result res = db->big_query("select distinct doc_id from doc_data");
    array doc_ids = allocate(res->num_rows());
    for (int i = 0; i < sizeof(doc_ids); i++)
        doc_ids[i] = (int)res->fetch_row()[0];
    write("Found %d Data entries in Database...\n", sizeof(doc_ids));

    res = db->big_query("select distinct ob_data from ob_data where ob_attr='CONTENT_ID';");
    array content_ids = allocate(res->num_rows());
    write("Found %d Documents in Database ...\n", sizeof(content_ids));
    for (int i = 0; i < sizeof(content_ids); i++)
        content_ids[i] = (int)res->fetch_row()[0];

    // Content that exists in doc_data but is not referenced by any object.
    array unallocated = doc_ids - content_ids;
    write("There are %d lost entries in the Database!\n", sizeof(unallocated));

    if (stringp(params->output) && params->output != "") {
        write("Saving Files to %s\n", params->output);
        string dirname = params->output;
        Stdio.mkdirhier(dirname);
        if (dirname[-1] != '/')
            dirname += "/";

        string tmpname = "mimetype.out.tmp";
        foreach (unallocated, int docid) {
            // Keep the name without extension; cutting it back out of fname
            // with sscanf("%s.file") stops at the first ".file" anywhere in
            // the path and can rename the wrong file.
            string base = dirname + docid;
            string fname = base + ".file";
            Stdio.File f = Stdio.File(fname, "wct");
            read_file(db, docid, f);
            f->close();

            // Ask the external "file" command for the mime type.
            Stdio.File outfile = Stdio.File(tmpname, "wct");
            Process.create_process(
                ({ "file", "-i", fname }),
                ([ "env": getenv(),
                   "stdout" : outfile, ])
            )->wait();
            outfile->close();

            string report = Stdio.read_file(tmpname);
            if (!stringp(report))
                continue;

            // fname becomes part of an sscanf() format: escape '%'.
            string prefix = replace(fname, "%", "%%") + ": ";
            string mimetype;
            if (sscanf(report, prefix + "%s; %*s", mimetype) ||
                sscanf(report, prefix + "%s, %*s", mimetype))
            {
                string ext = mimetypes[mimetype];
                if (ext)
                    mv(fname, base + "." + ext);
            }
        }
        // Do not leave the scratch file behind.
        rm(tmpname);
    }
}
344 
345 
346 };