J'ai une petite application de grattage et j'essaie d'y ajouter du multithreading. Voici le code (MyMech est WWW :: sous-classe Mechanize utilisé pour traiter les erreurs HTTP):L'application Threads se termine de manière inattendue
#!/usr/bin/perl
use strict;
use MyMech;
use File::Basename;
use File::Path;
use HTML::Entities;
use threads;
use threads::shared;
use Thread::Queue;
use List::Util qw(max sum);
my $page = 1;
my %CONFIG = read_config();
my $mech = MyMech->new(autocheck => 1);
$mech->quiet(0);
$mech->get($CONFIG{BASE_URL} . "/site-map.php");
my @championship_links =
$mech->find_all_links(url_regex => qr/\d{4}-\d{4}\/$/);
foreach my $championship_link (@championship_links) {
my @threads;
my $queue = Thread::Queue->new;
my $queue_processed = Thread::Queue->new;
my $url = sprintf $championship_link->url_abs();
print $url, "\n";
next unless $url =~ m{soccer}i;
$mech->get($url);
my ($last_round_loaded, $current_round) =
find_current_round($mech->content());
unless ($last_round_loaded) {
print "\tLoading rounds data...\n";
$mech->submit_form(
form_id => "leagueForm",
fields => {
round => $current_round,
},
);
}
my @match_links =
$mech->find_all_links(url_regex => qr/matchdetails\.php\?matchid=\d+$/);
foreach my $link (@match_links) {
$queue->enqueue($link);
}
print "Starting printing thread...\n";
my $printing_thread = threads->create(
sub { printing_thread(scalar(@match_links), $queue_processed) })
->detach;
push @threads, $printing_thread;
print "Starting threads...\n";
foreach my $thread_id (1 .. $CONFIG{NUMBER_OF_THREADS}) {
my $thread = threads->create(
sub { scrape_match($thread_id, $queue, $queue_processed) })
->join;
push @threads, $thread;
}
undef $queue;
undef $queue_processed;
foreach my $thread (threads->list()) {
if ($thread->is_running()) {
print $thread->tid(), "\n";
}
}
#sleep 5;
}
print "Finished!\n";
sub printing_thread {
my ($number_of_matches, $queue_processed) = @_;
my @fields =
qw (
championship
year
receiving_team
visiting_team
score
average_home
average_draw
average_away
max_home
max_draw
max_away
date
url
);
while ($number_of_matches) {
if (my $match = $queue_processed->dequeue_nb) {
open my $fh, ">>:encoding(UTF-8)", $CONFIG{RESULT_FILE} or die $!;
print $fh join("\t", @{$match}{@fields}), "\n";
close $fh;
$number_of_matches--;
}
}
threads->exit();
}
sub scrape_match {
my ($thread_id, $queue, $queue_processed) = @_;
while (my $match_link = $queue->dequeue_nb) {
my $url = sprintf $match_link->url_abs();
print "\t$url", "\n";
my $mech = MyMech->new(autocheck => 1);
$mech->quiet(0);
$mech->get($url);
my $match = parse_match($mech->content());
$match->{url} = $url;
$queue_processed->enqueue($match);
}
return 1;
}
et moi avons des choses étranges avec ce code. Parfois, il s'exécute mais parfois il sort sans erreurs (au point ->detach
). Je sais que @match_links contient des données mais les threads ne sont pas créés et ils se ferment juste. Habituellement, il se termine après le traitement de la deuxième entrée $championship_link
.
Peut-être que je fais quelque chose de mal?
Mise à jour Voici le code pour sous-programme find_current_round
(mais je suis sûr que ce n'est pas lié à la question):
sub find_current_round {
my ($html) = @_;
my ($select_html) = $html =~ m{
<select\s+name="round"[^>]+>\s*
(.+?)
</select>
}isx;
my ($option_html, $current_round) = $select_html =~ m{
(<option\s+value="\d+"(?:\s+ selected="selected")?>(\d+)</option>)\Z
}isx;
my ($last_round_loaded) = $option_html =~ m{selected};
return ($last_round_loaded, $current_round);
}
Il manque le sous-programme 'find_current_round'. Pourriez-vous le poster aussi? – Zaid
@Zaid: J'ai posté du code pour find_current_round. – gangabass