File Coverage

blib/lib/MogileFS/Worker/Fsck.pm
Criterion Covered Total %
statement 87 314 27.7
branch 0 102 0.0
condition 0 28 0.0
subroutine 29 49 59.1
pod 0 13 0.0
total 116 506 22.9


line stmt bran cond sub pod time code
1             package MogileFS::Worker::Fsck;
2              
3 21     21   168 use strict;
  21         46  
  21         1025  
4 21     21   144 use base 'MogileFS::Worker';
  21         46  
  21         3652  
5             use fields (
6 21         202 'opt_nostat', # bool: do we trust mogstoreds? skipping size stats?
7             'opt_checksum', # (class|off|MD5) checksum mode
8 21     21   133 );
  21         52  
9 21     21   1495 use MogileFS::Util qw(every error debug);
  21         43  
  21         1885  
10 21     21   133 use MogileFS::Config;
  21         72  
  21         3976  
11 21     21   141 use MogileFS::Server;
  21         62  
  21         514  
12 21     21   138 use List::Util ();
  21         63  
  21         391  
13 21     21   114 use Time::HiRes ();
  21         45  
  21         529  
14              
15 21     21   123 use constant SUCCESS => 0;
  21         47  
  21         1455  
16 21     21   118 use constant TEMPORARY => 1;
  21         58  
  21         961  
17 21     21   111 use constant PERMANENT => 2;
  21         45  
  21         978  
18 21     21   119 use constant REPLICATE => 3;
  21         83  
  21         1145  
19              
20 21     21   208 use constant EV_NO_PATHS => "NOPA";
  21         54  
  21         1195  
21 21     21   123 use constant EV_POLICY_VIOLATION => "POVI";
  21         41  
  21         944  
22 21     21   128 use constant EV_FILE_MISSING => "MISS";
  21         56  
  21         1021  
23 21     21   120 use constant EV_BAD_LENGTH => "BLEN";
  21         1136  
  21         1002  
24 21     21   126 use constant EV_CANT_FIX => "GONE";
  21         41  
  21         917  
25 21     21   110 use constant EV_START_SEARCH => "SRCH";
  21         41  
  21         1000  
26 21     21   116 use constant EV_FOUND_FID => "FOND";
  21         42  
  21         934  
27 21     21   114 use constant EV_RE_REPLICATE => "REPL";
  21         51  
  21         1224  
28 21     21   135 use constant EV_BAD_COUNT => "BCNT";
  21         269  
  21         1069  
29 21     21   126 use constant EV_BAD_CHECKSUM => "BSUM";
  21         46  
  21         1253  
30 21     21   141 use constant EV_NO_CHECKSUM => "NSUM";
  21         55  
  21         953  
31 21     21   115 use constant EV_MULTI_CHECKSUM => "MSUM";
  21         49  
  21         1004  
32 21     21   230 use constant EV_BAD_HASHTYPE => "BALG";
  21         45  
  21         906  
33              
34 21     21   124 use POSIX ();
  21         42  
  21         12381  
35              
36             my $nowish; # approximate unixtime, updated once per loop.
37              
38 0     0 0   sub watchdog_timeout { 120 }
39              
40             sub work {
41 0     0 0   my $self = shift;
42              
43             # this can be CPU-intensive. let's nice ourselves down.
44 0           POSIX::nice(10);
45              
46 0           my $sto = Mgd::get_store();
47 0           my $max_checked = 0;
48              
49             every(2.0, sub {
50 0     0     my $sleep_set = shift;
51 0           $nowish = time();
52 0           local $Mgd::nowish = $nowish;
53              
54 0           my $queue_todo = $self->queue_todo('fsck');
55             # This counts the same as a $self->still_alive;
56 0           $self->send_to_parent('worker_bored 50 fsck');
57 0 0         return unless @{$queue_todo};
  0            
58 0 0         return unless $self->validate_dbh;
59              
60 0           my @fids = ();
61 0           while (my $todo = shift @{$queue_todo}) {
  0            
62 0           my $fid = MogileFS::FID->new($todo->{fid});
63 0 0         if ($fid->exists) {
64 0           push(@fids, $fid);
65             } else {
66             # FID stopped existing before being checked.
67 0           $sto->delete_fid_from_file_to_queue($fid->id, FSCK_QUEUE);
68             }
69             }
70 0 0         return unless @fids;
71              
72 0   0       $self->{opt_nostat} = MogileFS::Config->server_setting('fsck_opt_policy_only') || 0;
73 0           my $alg = MogileFS::Config->server_setting_cached("fsck_checksum");
74 0 0 0       if (defined($alg) && $alg eq "off") {
75 0           $self->{opt_checksum} = "off";
76             } else {
77 0 0         $self->{opt_checksum} = MogileFS::Checksum->valid_alg($alg) ? $alg : 0;
78             }
79 0           MogileFS::FID->mass_load_devids(@fids);
80              
81             # don't sleep in loop, next round, since we found stuff to work on
82             # this round...
83 0           $sleep_set->(0);
84              
85 0           my $new_max;
86 0           my $hit_problem = 0;
87              
88 0           foreach my $fid (@fids) {
89 0 0         if (!$self->check_fid($fid)) {
90             # some connectivity problem... retry this fid later.
91             # (don't dequeue it)
92 0           $self->still_alive;
93 0           next;
94             }
95 0           $sto->delete_fid_from_file_to_queue($fid->id, FSCK_QUEUE);
96             }
97 0           });
98             }
99              
100             # given a $fid (MogileFS::FID, with pre-populated ->devids data)
101             # return 0 if reachability problems.
102             # return 1 if fid was checked (regardless of there being problems or not)
103             # if no problems, no action.
104             # if problems, log & enqueue fixes
105 21     21   134 use constant STALLED => 0;
  21         45  
  21         1091  
106 21     21   120 use constant HANDLED => 1;
  21         52  
  21         19390  
107             sub check_fid {
108 0     0 0   my ($self, $fid) = @_;
109              
110             my $fix = sub {
111             # we cached devids without locking for the fast path,
112             # ensure we get an up-to-date list in the slow path.
113 0     0     $fid->forget_cached_devids;
114              
115 0           my $sto = Mgd::get_store();
116 0 0         unless ($sto->should_begin_replicating_fidid($fid->id)) {
117 0           error("Fsck stalled for fid $fid: failed to acquire lock");
118 0           return STALLED;
119             }
120              
121 0 0         unless ($fid->exists) {
122             # FID stopped existing while doing (or waiting on)
123             # the fast check, give up on this fid
124 0           $sto->note_done_replicating($fid->id);
125 0           return HANDLED;
126             }
127              
128 0           my $fixed = eval { $self->fix_fid($fid) };
  0            
129 0           my $err = $@;
130 0           $sto->note_done_replicating($fid->id);
131 0 0         if (! defined $fixed) {
132 0           error("Fsck stalled for fid $fid: $err");
133 0           return STALLED;
134             }
135 0 0         $fid->fsck_log(EV_CANT_FIX) if ! $fixed;
136              
137             # that might've all taken a while, let's update our approximate time
138 0           $nowish = $self->still_alive;
139 0           return HANDLED;
140 0           };
141              
142             # first obviously broken case: no devids even presumed to exist.
143 0 0         unless ($fid->devids) {
144             # first, log this weird condition.
145 0           $fid->fsck_log(EV_NO_PATHS);
146              
147             # weird, schedule a fix (which will do a search over all
148             # devices as a last-ditch effort to locate it)
149 0           return $fix->();
150             }
151              
152             # first, see if the assumed devids meet the replication policy for
153             # the fid's class.
154 0 0         unless ($fid->devids_meet_policy) {
155             # log a policy violation
156 0           $fid->fsck_log(EV_POLICY_VIOLATION);
157 0           return $fix->();
158             }
159              
160             # This is a simple fixup case
161             # If we got here, we already know we have no policy violation and
162             # don't need to call $fix->() to just fix a devcount
163 0           $self->maybe_fix_devcount($fid);
164              
165             # missing checksum row
166 0 0 0       if ($fid->class->hashtype && ! $fid->checksum) {
167 0           return $fix->();
168             }
169              
170             # in the fast case, do nothing else (don't check if assumed file
171             # locations are actually there). in the fast case, all we do is
172             # check the replication policy, which is already done, so finish.
173 0 0         return HANDLED if $self->{opt_nostat};
174              
175 0 0 0       if ($self->{opt_checksum} && $self->{opt_checksum} ne "off") {
176 0           return $fix->();
177             }
178              
179             # stat each device to see if it's still there. on first problem,
180             # stop and go into the slow(er) fix function.
181 0           my $err;
182             my $rv = $self->parallel_check_sizes([ $fid->devfids ], sub {
183 0     0     my ($dfid, $disk_size) = @_;
184 0 0         if (! defined $disk_size) {
185 0           my $dev = $dfid->device;
186             # We end up checking is_perm_dead twice, but that's the way the
187             # flow goes...
188 0 0         if ($dev->dstate->is_perm_dead) {
189 0           $err = "needfix";
190 0           return 0;
191             }
192 0           error("Connectivity problem reaching device " . $dev->id . " on host " . $dev->host->ip . "\n");
193 0           $err = "stalled";
194 0           return 0;
195             }
196 0 0         return 1 if $disk_size == $fid->length;
197 0           $err = "needfix";
198             # Note: not doing fsck_log, as fix_fid will log status for each device.
199 0           return 0;
200 0           });
201              
202 0 0         if ($rv) {
    0          
    0          
203 0 0 0       return ($fid->class->hashtype && !($self->{opt_checksum} && $self->{opt_checksum} eq "off"))
204             ? $fix->() : HANDLED;
205             } elsif ($err eq "stalled") {
206 0           return STALLED;
207             } elsif ($err eq "needfix") {
208 0           return $fix->();
209             } else {
210 0           die "Unknown error checking fid sizes in parallel.\n";
211             }
212             }
213              
214             # returns true if all size checks succeeded, false otherwise
215             sub parallel_check_sizes {
216 0     0 0   my ($self, $dflist, $cb) = @_;
217 0           my $expect = scalar @$dflist;
218 0           my ($good, $done) = (0, 0);
219              
220 0           foreach my $df (@$dflist) {
221             $df->size_on_disk(sub {
222 0     0     my ($size) = @_;
223 0           $done++;
224 0 0         $good++ if $cb->($df, $size);
225 0           });
226             }
227              
228 0     0     Danga::Socket->SetPostLoopCallback(sub { $done != $expect });
  0            
229 0           Danga::Socket->EventLoop;
230              
231 0           return $good == $expect;
232             }
233              
234             # this is the slow path. if something above in check_fid finds
235             # something amiss in any way, we take the slow path on a fid and try
236             # really hard to fix the situation.
237             #
238             # return true if situation handled, 0 if nothing could be done.
239             # die on errors (like connectivity problems).
240 21     21   132 use constant CANT_FIX => 0;
  21         45  
  21         61596  
241             sub fix_fid {
242 0     0 0   my ($self, $fid) = @_;
243 0           debug(sprintf("Fixing FID %d", $fid->id));
244              
245             # make devfid objects from the devids that this fid is on,
246 0           my @dfids = map { MogileFS::DevFID->new($_, $fid) } $fid->devids;
  0            
247              
248             # track all known good copies (dev objects), as well as all bad
249             # copies (places it should've been, but isn't)
250 0           my @good_devs;
251             my @bad_devs;
252 0           my %already_checked; # devid -> 1.
253 0   0       my $alg = $fid->class->hashname || $self->{opt_checksum};
254 0           my $checksums = {};
255 0     0     my $ping_cb = sub { $self->still_alive };
  0            
256              
257             my $check_dfids = sub {
258 0     0     my $is_desperate_mode = shift;
259              
260             # stat all devices.
261 0           foreach my $dfid (@dfids) {
262 0           my $dev = $dfid->device;
263 0 0         next if $already_checked{$dev->id}++;
264              
265             # Got a dead link, but reaper hasn't cleared it yet?
266 0 0         if ($dev->dstate->is_perm_dead) {
267 0           push @bad_devs, $dev;
268 0           next;
269             }
270              
271 0           my $disk_size = $dfid->size_on_disk;
272 0 0         die "dev " . $dev->id . " unreachable" unless defined $disk_size;
273              
274 0 0         if ($disk_size == $fid->length) {
275 0 0 0       if ($alg && $alg ne "off") {
276 0           my $digest = $self->checksum_on_disk($dfid, $alg, $ping_cb);
277 0 0         unless (defined $digest) {
278 0           die "dev " . $dev->id . " unreachable";
279             }
280              
281             # DELETE could've hit right after size check
282 0 0         if ($digest eq "-1") {
283 0 0         unless ($is_desperate_mode) {
284 0           $fid->fsck_log(EV_FILE_MISSING, $dev);
285             }
286 0           push @bad_devs, $dfid->device;
287 0           next;
288             }
289 0   0       push @{$checksums->{$digest} ||= []}, $dfid->device;
  0            
290             }
291              
292 0           push @good_devs, $dfid->device;
293             # if we were doing a desperate search, one is enough, we can stop now!
294 0 0         return if $is_desperate_mode;
295 0           next;
296             }
297              
298             # don't log in desperate mode, as we'd have "file missing!" log entries
299             # for every device in the normal case, which is expected.
300 0 0         unless ($is_desperate_mode) {
301 0 0         if ($disk_size == -1) {
302 0           $fid->fsck_log(EV_FILE_MISSING, $dev);
303             } else {
304 0           $fid->fsck_log(EV_BAD_LENGTH, $dev);
305             }
306             }
307              
308 0           push @bad_devs, $dfid->device;
309             }
310 0           };
311              
312 0           $check_dfids->();
313              
314             # if we didn't find it anywhere, let's go do an exhaustive search over
315             # all devices, looking for it...
316 0 0         unless (@good_devs) {
317             # replace @dfids with list of all (alive) devices. dups will be ignored by
318             # check_dfids
319 0           $fid->fsck_log(EV_START_SEARCH);
320 0           @dfids = List::Util::shuffle(
321 0           map { MogileFS::DevFID->new($_, $fid) }
322 0           grep { $_->dstate->should_fsck_search_on }
323             Mgd::device_factory()->get_all
324             );
325 0           $check_dfids->("desperate");
326              
327             # still can't fix it?
328 0 0         unless (@good_devs) {
329 0           $self->forget_bad_devs($fid, @bad_devs);
330 0           $fid->update_devcount;
331 0           return CANT_FIX;
332             }
333              
334             # wow, we actually found it!
335 0           $fid->fsck_log(EV_FOUND_FID);
336 0           $fid->note_on_device($good_devs[0]); # at least one good one.
337              
338             # fall through to check policy (which will most likely be
339             # wrong, with only one file_on record...) and re-replicate
340             }
341              
342 0           $self->forget_bad_devs($fid, @bad_devs);
343             # in case the devcount or similar was fixed.
344 0           $fid->want_reload;
345              
346 0 0 0       $self->fix_checksums($fid, $alg, $checksums) if $alg && $alg ne "off";
347              
348             # Note: this will reload devids, if they called 'note_on_device'
349             # or 'forget_about_device'
350 0 0         unless ($fid->devids_meet_policy) {
351 0           $fid->enqueue_for_replication(in => 1);
352 0           $fid->fsck_log(EV_RE_REPLICATE);
353 0           return HANDLED;
354             }
355            
356             # Clean up the device count if it's wrong
357 0           $self->maybe_fix_devcount($fid);
358              
359 0           return HANDLED;
360             }
361              
362             sub forget_file_on_with_bad_checksums {
363 0     0 0   my ($self, $fid, $checksums) = @_;
364 0           foreach my $bdevs (values %$checksums) {
365 0           foreach my $bdev (@$bdevs) {
366 0           error("removing file_on mapping for fid=" . $fid->id . ", dev=" . $bdev->id);
367 0           $fid->forget_about_device($bdev);
368             }
369             }
370             }
371              
372             # returns -1 on missing,
373             # undef on connectivity error,
374             # else checksum of file on disk (after HTTP GET or mogstored read)
375             sub checksum_on_disk {
376 0     0 0   my ($self, $dfid, $alg, $ping_cb) = @_;
377 0           return $dfid->checksum_on_disk($alg, $ping_cb, "fsck");
378             }
379              
380             sub bad_checksums_errmsg {
381 0     0 0   my ($self, $alg, $checksums) = @_;
382 0           my @err;
383              
384 0           foreach my $checksum (keys %$checksums) {
385 0           my $bdevs = join(",", map { $_->id } @{$checksums->{$checksum}});
  0            
  0            
386 0           $checksum = unpack("H*", $checksum);
387 0           push @err, "$alg:$checksum on devids=[$bdevs]"
388             }
389              
390 0           return join('; ', @err);
391             }
392              
393             # we don't know what checksum the file is supposed to be, but some
394             # of the devices had checksums that didn't match the other(s).
395             sub auto_checksums_bad {
396 0     0 0   my ($self, $fid, $checksums) = @_;
397 0           my $alg = $self->{opt_checksum};
398 0           my $err = $self->bad_checksums_errmsg($alg, $checksums);
399              
400 0           error("$fid has multiple checksums: $err");
401 0           $fid->fsck_log(EV_MULTI_CHECKSUM);
402             }
403              
404             sub all_checksums_bad {
405 0     0 0   my ($self, $fid, $checksums) = @_;
406 0 0         my $alg = $fid->class->hashname or return; # class could've changed
407 0           my $cur_checksum = $fid->checksum;
408 0           my $err = $self->bad_checksums_errmsg($alg, $checksums);
409 0 0         my $cur = $cur_checksum ? "Expected: $cur_checksum"
410             : "No known valid checksum";
411 0           error("all checksums bad: $err. $cur");
412 0           $fid->fsck_log(EV_BAD_CHECKSUM);
413             }
414              
415             sub fix_checksums {
416 0     0 0   my ($self, $fid, $alg, $checksums) = @_;
417 0           my $cur_checksum = $fid->checksum;
418 0           my @all_checksums = keys(%$checksums);
419              
420 0 0         if (scalar(@all_checksums) == 1) { # all checksums match, good!
    0          
    0          
421 0           my $disk_checksum = $all_checksums[0];
422 0 0         if ($cur_checksum) {
423 0 0         if ($cur_checksum->{checksum} ne $disk_checksum) {
424 0           my $expect = $cur_checksum->info;
425 0           my $actual = "$alg:" . unpack("H*", $disk_checksum);
426 0           error("$cur_checksum does not match disk: $actual");
427 0 0         if ($alg ne $cur_checksum->hashname) {
428 0           $fid->fsck_log(EV_BAD_HASHTYPE);
429             } else {
430 0           $fid->fsck_log(EV_BAD_CHECKSUM);
431             }
432             }
433             } else { # fresh row to checksum
434 0           my $hashtype = $fid->class->hashtype;
435              
436             # we store this in the database
437 0 0         if ($hashtype) {
438 0           my %row = (
439             fid => $fid->id,
440             checksum => $disk_checksum,
441             hashtype => $hashtype,
442             );
443 0           my $new_checksum = MogileFS::Checksum->new(\%row);
444 0           debug("creating new checksum=$new_checksum");
445 0           $fid->fsck_log(EV_NO_CHECKSUM);
446 0           $new_checksum->save;
447             } else {
448 0           my $hex_checksum = unpack("H*", $disk_checksum);
449 0           my $alg = $self->{opt_checksum};
450 0           debug("fsck_checksum=auto good: $fid $alg:$hex_checksum");
451             }
452             }
453             } elsif ($cur_checksum) {
454 0           my $good = delete($checksums->{$cur_checksum->{checksum}});
455 0 0 0       if ($good && (scalar(@$good) > 0)) {
456 0           $self->forget_file_on_with_bad_checksums($fid, $checksums);
457             # will fail $fid->devids_meet_policy and re-replicate
458             } else {
459 0           $self->all_checksums_bad($fid, $checksums);
460             }
461             } elsif ($self->{opt_checksum}) {
462 0           $self->auto_checksums_bad($fid, $checksums);
463             } else {
464 0           $self->all_checksums_bad($fid, $checksums);
465             }
466             }
467              
468             # remove the file_on mappings for devices that were bogus/missing.
469             sub forget_bad_devs {
470 0     0 0   my ($self, $fid, @bad_devs) = @_;
471 0           foreach my $bdev (@bad_devs) {
472 0           error("removing file_on mapping for fid=" . $fid->id . ", dev=" . $bdev->id);
473 0           $fid->forget_about_device($bdev);
474             }
475             }
476              
477             sub maybe_fix_devcount {
478             # don't even log BCNT errors if skip_devcount is enabled
479 0 0   0 0   return if MogileFS::Config->server_setting_cached('skip_devcount');
480              
481 0           my ($self, $fid) = @_;
482 0 0         return if scalar($fid->devids) == $fid->devcount;
483             # log a bad count
484 0           $fid->fsck_log(EV_BAD_COUNT);
485 0           $fid->update_devcount();
486             }
487              
488             1;
489              
490             # Local Variables:
491             # mode: perl
492             # c-basic-indent: 4
493             # indent-tabs-mode: nil
494             # End: