# Run in the background and log crawled URLs:
# nohup scrapy runspider spider.py > urls.log &
import scrapy
class MySpider(scrapy.Spider):
    name = 'domain'
    allowed_domains = ['domain']
    start_urls = [
        'https://domain/'
    ]

    # Remember absolute URLs already requested so each page is visited once
    seen = set()

    def parse(self, response):
        # To extract headings instead, yield them here:
        # for h3 in response.xpath('//h3').getall():
        #     yield {"title": h3}
        for link in response.xpath('//a/@href').getall():
            # Resolve relative hrefs before deduplicating; otherwise the same
            # relative path found on two pages would be followed only once.
            url = response.urljoin(link)
            if url not in self.seen:
                print(url)
                self.seen.add(url)
                yield scrapy.Request(url, self.parse)
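
# Alternative sketch: Scrapy's CrawlSpider with a LinkExtractor rule follows
# links for you, and the framework's built-in duplicate filter already skips
# URLs it has requested before, so no manual seen-set is needed. The names
# LinkSpider and parse_item below are illustrative. Kept commented out
# because `scrapy runspider` expects one spider per file.
#
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
#
# class LinkSpider(CrawlSpider):
#     name = 'links'
#     allowed_domains = ['domain']
#     start_urls = ['https://domain/']
#     rules = (Rule(LinkExtractor(), callback='parse_item', follow=True),)
#
#     def parse_item(self, response):
#         yield {'url': response.url}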