I'm looking for a dxml library, which could be an lxml fork (http://lxml.de/), and a dscrap lib — something like a Scrapy fork, but designed with dynamic web page scraping in mind!
I am trying to write code like the following Python Scrapy scraper:
/*********************************
import scrapy
from scrapy.http import FormRequest, Request
from investing.items import InvestingItem
#from selenium import webdriver
import time #used for the sleep() function
from datetime import datetime
import logging
# Module-level logger shared by the spider below; named after the project.
log = logging.getLogger('investing')
class InvestingSpider(scrapy.Spider):
    """Poll the live gold quote page on investing.com.

    Each pass extracts the instrument name, the last traded price, and a
    local timestamp into an ``InvestingItem``, then immediately re-requests
    the same URL so the spider keeps polling for fresh values.
    """

    name = "investing"
    allowed_domains = ["investing.com"]
    start_urls = ('http://www.investing.com/commodities/gold',)

    def parse(self, response):
        """Extract one quote from *response* and schedule the next poll.

        Yields:
            InvestingItem: with 'name', 'vl' (last price) and 'time' fields.
            scrapy.Request: a repeat request for the same page.
        """
        item = InvestingItem()
        log.info('Parse function called on %s', response.url)

        # extract_first() returns None instead of raising IndexError when
        # the node is missing (e.g. if the page layout changes).
        name = response.xpath('//*[@id="leftColumn"]/div[1]/h1/text()').extract_first()
        log.info('name(%s)', name)
        item['name'] = name

        vl = response.xpath('//*[@id="last_last"]/text()').extract_first()
        log.info('vl(%s)', vl)
        item['vl'] = vl

        # Renamed from `time`: the original local shadowed the imported
        # `time` module. Capture once so the logged and stored values match.
        now = datetime.now()
        log.info('time(%s)', now)
        item['time'] = now

        yield item

        # dont_filter=True is required here: without it Scrapy's duplicate
        # filter silently drops this repeated request to the same URL and
        # the spider stops after a single pass instead of polling.
        yield scrapy.Request(response.url, callback=self.parse,
                             dont_filter=True)
*******************************************************************//
It could be an easy tool — an easy way to get feed data from a scraper, and to quickly run threaded scrapers in D.
Cheers, and merry Christmas ;)
RADIO RAMAX