Je suis en train de traiter 100 000 noms de domaine dans un fichier CSV à partir des résultats de Siteadvisor en utilisant urllib (pas la meilleure méthode, je sais). Cependant, mon script actuel crée trop de threads et Python se heurte à des erreurs. Existe-t-il un moyen de « morceler » ce script pour traiter X domaines à la fois (par exemple, 10 à 20) afin d'éviter ces erreurs ? Merci d'avance.

Titre : Comment puis-je diviser ce script Python multithread en « morceaux » ?
import threading
import urllib
class Resolver(threading.Thread):
    """Worker thread: fetch the SiteAdvisor page for one domain and store a
    one-word verdict ("safe" / "caution" / "warning" / "unknown" / "") in a
    shared result dict keyed by the domain name."""

    def __init__(self, address, result_dict):
        threading.Thread.__init__(self)
        self.address = address          # domain name to look up
        self.result_dict = result_dict  # shared {address: verdict} map

    def run(self):
        try:
            # The verdict markers appear early in the page, so the first
            # 12000 bytes are enough -- no need to download the whole page.
            content = urllib.urlopen("http://www.siteadvisor.com/sites/" + self.address).read(12000)
            search1 = content.find("didn't find any significant problems.")
            search2 = content.find('yellow')
            search3 = content.find('web reputation analysis found potential security')
            search4 = content.find("don't have the results yet.")
            if search1 != -1:
                result = "safe"
            elif search2 != -1:
                result = "caution"
            elif search3 != -1:
                result = "warning"
            elif search4 != -1:
                result = "unknown"
            else:
                result = ""
            self.result_dict[self.address] = result
        except Exception:
            # Best-effort: a network or parse failure simply leaves this
            # domain out of the results.  Deliberately not a bare `except:`,
            # which would also swallow KeyboardInterrupt/SystemExit.
            pass
def main():
    """Read domains from "domainslist" (one per line), resolve each in its
    own thread, then write address,verdict pairs to final.csv."""
    # `with` guarantees the input file is closed even on error
    # (the original leaked the open file handle).
    with open("domainslist", "r") as infile:
        addresses = [line.strip() for line in infile if line.strip()]

    threads = []
    results = {}
    # NOTE(review): one thread per domain with no cap -- this is what
    # exhausts thread resources on a 100k-domain input.
    for address in addresses:
        resolver_thread = Resolver(address, results)
        threads.append(resolver_thread)
        resolver_thread.start()
    for thread in threads:
        thread.join()

    with open('final.csv', 'w') as outfile:
        outfile.write("\n".join("%s,%s" % (address, verdict)
                                for address, verdict in results.iteritems()))

if __name__ == '__main__':
    main()
Modification : nouvelle version, basée sur les suggestions d'andyortlieb.
import threading
import urllib
import time
class Resolver(threading.Thread):
def __init__(self, address, result_dict, threads):
threading.Thread.__init__(self)
self.address = address
self.result_dict = result_dict
self.threads = threads
def run(self):
try:
content = urllib.urlopen("http://www.siteadvisor.com/sites/" + self.address).read(12000)
search1 = content.find("didn't find any significant problems.")
search2 = content.find('yellow')
search3 = content.find('web reputation analysis found potential security')
search4 = content.find("don't have the results yet.")
if search1 != -1:
result = "safe"
elif search2 != -1:
result = "caution"
elif search3 != -1:
result = "warning"
elif search4 != -1:
result = "unknown"
else:
result = ""
self.result_dict[self.address] = result
outfile = open('final.csv', 'a')
outfile.write(self.address + "," + result + "\n")
outfile.close()
print self.address + result
threads.remove(self)
except:
pass
def main():
    """Read domains from "domainslist" and resolve them with at most 20
    concurrent Resolver threads.  Workers append their own lines to
    final.csv, so there is no final write step here."""
    # `with` guarantees the input file is closed (original leaked it).
    with open("domainslist", "r") as infile:
        addresses = [line.strip() for line in infile if line.strip()]

    threads = []
    results = {}
    for address in addresses:
        # Throttle: wait until a slot frees up.  Workers remove themselves
        # from `threads` when they finish (see Resolver.run).
        while len(threads) >= 20:
            time.sleep(.25)
        resolver_thread = Resolver(address, results, threads)
        threads.append(resolver_thread)
        resolver_thread.start()

    # Join a snapshot: the workers mutate `threads` while we iterate, and
    # iterating the live list can skip entries.  Joining a thread that has
    # already finished returns immediately, so the copy is safe.
    for thread in list(threads):
        thread.join()
    # final.csv is written incrementally by the workers (append mode) so
    # the script's progress can be tracked while it runs.

if __name__ == '__main__':
    main()
Merci pour votre aide jusqu'à présent. J'ai mis en place vos modifications. Cependant, le script n'atteint que 20 domaines. J'ai mis mon script ci-dessus. Sais-tu quel est le problème? – Tom
Je crois que tout ce dont vous avez besoin est de remplacer `threads.remove(self)` par `self.threads.remove(self)`. – andyortlieb
Facepalm. Je n'ai pas vu ça. Merci de votre aide. – Tom