- Notifications
You must be signed in to change notification settings - Fork20
Extracting and parsing structured data with jQuery Selector, XPath or JsonPath from common web formats such as HTML, XML and JSON.
License
fivesmallq/web-data-extractor
Folders and files
| Name | Name | Last commit message | Last commit date | |
|---|---|---|---|---|
Repository files navigation
Extracting and parsing structured data with jQuery Selector, XPath or JsonPath from common web formats such as HTML, XML and JSON.
Implements:
# Usage
To add a dependency on Web-Data-Extractor using Maven, use the following:
<dependency> <groupId>im.nll.data</groupId> <artifactId>extractor</artifactId> <version>0.9.6</version></dependency>
To add a dependency using Gradle:
dependencies { compile 'im.nll.data:extractor:0.9.6' }
# Examples
### Extract single data
Stringfollowers =Extractors.on(baseHtml) .extract(newSelectorExtractor("div.followers")) .with(newRegexExtractor("\\d+")) .asString();
or use static method
Stringfollowers =Extractors.on(baseHtml) .extract(selector("div.followers")) .with(regex("\\d+")) .asString();
or short string
String followers = Extractors.on(baseHtml) .extract("selector:div.followers") .with(regex("\\d+")) .asString();
more method
Stringyear =Extractors.on("<div> Talk is cheap. Show me the code. - Fri, 25 Aug 2000 </div>") .extract(selector("div"))// extract with selector .filter(value ->value.trim())// trim result .with(regex("20\\d{2}"))// get year with regex .filter(value ->"from " +value)// append 'from' string .asString();Assert.assertEquals("from 2000",year);
### Extract data to map
@TestpublicvoidtestToMap()throwsException {Map<String,String>dataMap =Extractors.on(baseHtml) .extract("title",selector("a.title")) .extract("followers",selector("div.followers")).with(regex("\\d+")) .extract("description",selector("div.description")) .asMap();Assert.assertEquals("fivesmallq",dataMap.get("title"));Assert.assertEquals("29671",dataMap.get("followers"));Assert.assertEquals("Talk is cheap. Show me the code.",dataMap.get("description")); }
@TestpublicvoidtestToMapList()throwsException {//split param must implements ListableExtractorList<Map<String,String>>languages =Extractors.on(listHtml) .split(selector("tr.item.html")) .extract("type",selector("td.type")) .extract("name",selector("td.name")) .extract("url",selector("td.url")) .asMapList();Assert.assertNotNull(languages);Map<String,String>second =languages.get(1);Assert.assertEquals(languages.size(),3);Assert.assertEquals(second.get("type"),"dynamic");Assert.assertEquals(second.get("name"),"Ruby");Assert.assertEquals(second.get("url"),"https://www.ruby-lang.org"); }
### Extract data to bean
@TestpublicvoidtestToBean()throwsException {Basebase =Extractors.on(baseHtml) .extract("title",selector("a.title")) .extract("followers",selector("div.followers")).with(regex("\\d+")) .extract("description",selector("div.description")) .asBean(Base.class);Assert.assertEquals("fivesmallq",base.getTitle());Assert.assertEquals("29671",base.getFollowers());Assert.assertEquals("Talk is cheap. Show me the code.",base.getDescription()); }
### Extract data to bean list
@TestpublicvoidtestToBeanList()throwsException {List<Language>languages =Extractors.on(listHtml) .split(selector("tr.item.html")) .extract("type",selector("td.type")) .extract("name",selector("td.name")) .extract("url",selector("td.url")) .asBeanList(Language.class);Assert.assertNotNull(languages);Languagesecond =languages.get(1);Assert.assertEquals(languages.size(),3);Assert.assertEquals(second.getType(),"dynamic");Assert.assertEquals(second.getName(),"Ruby");Assert.assertEquals(second.getUrl(),"https://www.ruby-lang.org"); }
### Support Embeddable bean
Set an embeddable field value by `embeddable.fieldName`.
@TestpublicvoidtestEmbeddable() {List<Activity>activities =Extractors.on(base5Xml) .split(xpath("//ProcessDefinition/activity").removeNamespace()) .extract("name",xpath("//activity/@name")) .extract("type",xpath("//activity/type/text()")) .extract("resourceType",xpath("//activity/resourceType/text()")) .extract("config.encoding",xpath("//activity/config/encoding/text()")) .extract("config.compressFile",xpath("//activity/config/compressFile/text()")) .extract("inputBindings.fileName",xpath("//activity/inputBindings/WriteActivityInputTextClass/fileName/value-of/@select")) .extract("inputBindings.textContent",xpath("//activity/inputBindings/WriteActivityInputTextClass/textContent/value-of/@select")) .asBeanList(Activity.class);Assert.assertNotNull(activities);Assert.assertEquals(1,activities.size());Activityactivity =activities.get(0);Assert.assertEquals("Output1",activity.getName());Assert.assertEquals("com.tibco.plugin.file.FileWriteActivity",activity.getType());//configConfigconfig =activity.getConfig();Assert.assertEquals("text",config.getEncoding());Assert.assertEquals("None",config.getCompressFile());//bindBindingSpecbindingSpec =activity.getInputBindings();Assert.assertEquals("$_globalVariables/ns:GlobalVariables/GlobalVariables/OutputLocation",bindingSpec.getFileName());Assert.assertEquals("$File-Poller/pfx:EventSourceOuputTextClass/fileContent/textContent",bindingSpec.getTextContent()); }
### Filter
`before` and `after` are the global filters.
@TestpublicvoidtestToBeanListFilterBeforeAndAfter()throwsException {List<Language>languages =Extractors.on(listHtml)//before and after just process the extract value, then execute the follow filter method. .before(value ->"|before|" +value) .after(value ->value +"|after|") .split(xpath("//tr[@class='item']")) .extract("type",xpath("//td[1]/text()")).filter(value ->"filter:" +value) .extract("name",xpath("//td[2]/text()")).filter(value ->"filter:" +value) .extract("url",xpath("//td[3]/text()")).filter(value ->"filter:" +value) .asBeanList(Language.class);Assert.assertNotNull(languages);Languagesecond =languages.get(1);Assert.assertEquals(languages.size(),3);Assert.assertEquals(second.getType(),"filter:|before|dynamic|after|");Assert.assertEquals(second.getName(),"filter:|before|Ruby|after|");Assert.assertEquals(second.getUrl(),"filter:|before|https://www.ruby-lang.org|after|"); }
See Example
#Contributing
Bug reports and pull requests are welcome on GitHub at https://github.com/fivesmallq/web-data-extractor.
About
Extracting and parsing structured data with jQuery Selector, XPath or JsonPath from common web formats such as HTML, XML and JSON.
Topics
Resources
License
Uh oh!
There was an error while loading. Please reload this page.
Stars
Watchers
Forks
Packages0
Uh oh!
There was an error while loading. Please reload this page.
Contributors3
Uh oh!
There was an error while loading. Please reload this page.