RejectedSoftware Forums

Sign up

dxml dscrap and xpath

I'm looking for a dxml library, which could be an lxml fork {http://lxml.de/}, and a dscrap library, like a fork of the Scrapy project, but with dynamic web page scraping in mind!

trying to write code like that in python

python scrapy scraper
/*********************************
import scrapy
from scrapy.http import FormRequest, Request
from investing.items import InvestingItem
#from selenium import webdriver
import time #used for the sleep() function
from datetime import datetime
import logging
log = logging.getLogger('investing')
class InvestingSpider(scrapy.Spider):
    """Spider for the investing.com gold commodities page.

    Each parsed response yields one ``InvestingItem`` carrying the page
    heading (``name``), the value of the ``last_last`` element (``vl``) and
    the time of the scrape (``time``), and then re-requests the same URL.
    """

    name = "investing"
    allowed_domains = ["investing.com"]
    start_urls = ('http://www.investing.com/commodities/gold',)

    # Abandoned Selenium attempt, kept for reference:
    # def __init__(self):
    #     self.driver = webdriver.Firefox()
    #     self.driver.get('http://www.investing.com/commodities/gold')
    #     time.sleep(0.5)  # tried pausing, in case problem was delayed loading - didn't work

    def parse(self, response):
        """Extract name and last value from *response* and yield them.

        Yields:
            An ``InvestingItem`` with ``name``, ``vl`` and ``time`` set,
            followed by a ``scrapy.Request`` for the same URL. Nothing is
            yielded when one of the XPath expressions matches no node.
        """
        log.info('Parse function called on %s', response.url)

        # extract_first() returns None when nothing matches, instead of the
        # bare IndexError that extract()[0] raises.
        name = response.xpath('//*[@id="leftColumn"]/div[1]/h1/text()').extract_first()
        if name is None:
            log.warning('name not found on %s', response.url)
            return
        log.info('name(%s)', name)

        vl = response.xpath('//*[@id="last_last"]/text()').extract_first()
        if vl is None:
            log.warning('vl not found on %s', response.url)
            return
        log.info('vl(%s)', vl)

        # Take the timestamp once so the logged and the stored value agree,
        # and do not shadow the imported ``time`` module with a local name.
        scraped_at = datetime.now()
        log.info('time(%s)', scraped_at)

        item = InvestingItem()
        item['name'] = name
        item['vl'] = vl
        item['time'] = scraped_at
        yield item

        # NOTE(review): Scrapy's duplicate filter drops requests it has
        # already seen unless dont_filter=True is passed, so this probably
        # does not poll the page indefinitely - confirm the intended behaviour.
        yield scrapy.Request(response.url, callback=self.parse)
        # with open("gold.json", 'wb') as f:
        #     f.write(item)
        #     self.driver.close()

*******************************************************************//
could be an easy tool

an easy way to get feed data from a scraper and quickly thread scrapers in D

cheers for Christmas ;)

RADIO RAMAX

Re: dxml dscrap and xpath

On Sat, 29 Oct 2016 15:05:12 GMT, radioramax wrote:

I'm looking for a dxml library, which could be an lxml fork {http://lxml.de/}, and a dscrap library, like a fork of the Scrapy project, but with dynamic web page scraping in mind!

trying to write code like that in python

python scrapy scraper
/*********************************
import scrapy
from scrapy.http import FormRequest, Request
from investing.items import InvestingItem
#from selenium import webdriver
import time #used for the sleep() function
from datetime import datetime
import logging
log = logging.getLogger('investing')
class InvestingSpider(scrapy.Spider):
    """Spider for the investing.com gold commodities page.

    Each parsed response yields one ``InvestingItem`` carrying the page
    heading (``name``), the value of the ``last_last`` element (``vl``) and
    the time of the scrape (``time``), and then re-requests the same URL.
    """

    name = "investing"
    allowed_domains = ["investing.com"]
    start_urls = ('http://www.investing.com/commodities/gold',)

    # Abandoned Selenium attempt, kept for reference:
    # def __init__(self):
    #     self.driver = webdriver.Firefox()
    #     self.driver.get('http://www.investing.com/commodities/gold')
    #     time.sleep(0.5)  # tried pausing, in case problem was delayed loading - didn't work

    def parse(self, response):
        """Extract name and last value from *response* and yield them.

        Yields:
            An ``InvestingItem`` with ``name``, ``vl`` and ``time`` set,
            followed by a ``scrapy.Request`` for the same URL. Nothing is
            yielded when one of the XPath expressions matches no node.
        """
        log.info('Parse function called on %s', response.url)

        # extract_first() returns None when nothing matches, instead of the
        # bare IndexError that extract()[0] raises.
        name = response.xpath('//*[@id="leftColumn"]/div[1]/h1/text()').extract_first()
        if name is None:
            log.warning('name not found on %s', response.url)
            return
        log.info('name(%s)', name)

        vl = response.xpath('//*[@id="last_last"]/text()').extract_first()
        if vl is None:
            log.warning('vl not found on %s', response.url)
            return
        log.info('vl(%s)', vl)

        # Take the timestamp once so the logged and the stored value agree,
        # and do not shadow the imported ``time`` module with a local name.
        scraped_at = datetime.now()
        log.info('time(%s)', scraped_at)

        item = InvestingItem()
        item['name'] = name
        item['vl'] = vl
        item['time'] = scraped_at
        yield item

        # NOTE(review): Scrapy's duplicate filter drops requests it has
        # already seen unless dont_filter=True is passed, so this probably
        # does not poll the page indefinitely - confirm the intended behaviour.
        yield scrapy.Request(response.url, callback=self.parse)
        # with open("gold.json", 'wb') as f:
        #     f.write(item)
        #     self.driver.close()

*******************************************************************//
could be an easy tool

an easy way to get feed data from a scraper and quickly thread scrapers in D

cheers for Christmas ;)

RADIO RAMAX

I heard that the http://code.dlang.org/packages/std-experimental-xml package, the designated successor to std.xml, was supposed to get an XPath module, but it currently doesn't have one. However, there is also http://code.dlang.org/packages/kxml (never tried it), which has a parseXPath method for its DOM.