# nohup scrapy runspider spider.py > urls.log &
import scrapy


class MySpider(scrapy.Spider):
    name = 'domain'
    allowed_domains = ['domain']
    start_urls = ['https://domain/']

    # Avoid repeats: remember links already seen so each URL is printed
    # and scheduled only once (a set gives O(1) membership checks).
    tempList = set()

    def parse(self, response):
        # To scrape headings instead, uncomment:
        # for h3 in response.xpath('//h3').getall():
        #     yield {"title": h3}
        for link in response.xpath('//a/@href').getall():
            if link not in self.tempList:
                print(link)
                self.tempList.add(link)
                # Follow the link and parse the page with this same callback.
                # Scrapy's scheduler also deduplicates requests by default,
                # so tempList mainly prevents duplicate print output.
                yield scrapy.Request(response.urljoin(link), callback=self.parse)
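
If the goal is a durable list of discovered URLs, Scrapy's feed exports can write them directly instead of relying on shell redirection of print output. A minimal sketch of parse under that assumption, yielding each link as an item so the exporter records it:

def parse(self, response):
    for link in response.xpath('//a/@href').getall():
        url = response.urljoin(link)
        if url not in self.tempList:
            self.tempList.add(url)
            # Emit the URL as an item for the feed exporter.
            yield {"url": url}
            # Keep crawling from this page.
            yield scrapy.Request(url, callback=self.parse)

Running scrapy runspider spider.py -o urls.jl then appends one JSON-lines record per link to urls.jl, so nohup and > urls.log are no longer needed to capture the results.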