Skip to content

Latest commit

 

History

History
1192 lines (598 loc) · 21.2 KB

File metadata and controls

1192 lines (598 loc) · 21.2 KB

downflux


downflux / WikimediaParser

Class: WikimediaParser

Defined in: packages/providers/wikimedia/WikimediaParser.ts:7

Wikimedia provider parser. It extends BaseParser, the default HTML parser shared by provider-specific parsers.

Remarks

Parsers exist to keep extraction rules close to the HTML they understand. The base parser collects common page fields such as anchors, images, meta tags, and source URLs, while provider parsers add the site-specific fields needed by transformers and pipelines.

Extends

BaseParser

Constructors

Constructor

new WikimediaParser(): WikimediaParser

Returns

WikimediaParser

Inherited from

BaseParser.constructor

Properties

kvsResolver

protected kvsResolver: KvsResolver

Defined in: packages/base/BaseParser.ts:16

Inherited from

BaseParser.kvsResolver

Methods

extractScriptMethodInput()

protected extractScriptMethodInput(fnName, html): string | null

Defined in: packages/base/BaseParser.ts:54

Extracts the first string argument passed to a named script function.

Parameters

fnName

string

Function name to search for.

html

string

HTML or script text to inspect.

Returns

string | null

The first string argument, or null when the call is absent.

Inherited from

BaseParser.extractScriptMethodInput


getFlashVars()

protected getFlashVars(html): FlashVarsOutput

Defined in: packages/base/BaseParser.ts:66

Extracts KVS flashVars video metadata from inline scripts.

Parameters

html

string

HTML containing one or more KVS flashVars blocks.

Returns

FlashVarsOutput

Normalized KVS fields, video sources, previews, and timelines.

Inherited from

BaseParser.getFlashVars


extractElementText()

protected extractElementText(html, begin, end, fallback?): string

Defined in: packages/base/BaseParser.ts:190

Parameters

html

string

begin

string

end

string

fallback?

string = ''

Returns

string

Inherited from

BaseParser.extractElementText


extractElementTextPair()

protected extractElementTextPair(html, begin, end, pos?): [string | null, number]

Defined in: packages/base/BaseParser.ts:199

Parameters

html

string

begin

string

end

string

pos?

number = 0

Returns

[string | null, number]

Inherited from

BaseParser.extractElementTextPair


extractAllPairs()

protected extractAllPairs(html, begin, end): Generator<string>

Defined in: packages/base/BaseParser.ts:208

Parameters

html

string

begin

string

end

string

Returns

Generator<string>

Inherited from

BaseParser.extractAllPairs


extractAll()

protected extractAll(html, rules, startPos?): [Record<string, string>, number]

Defined in: packages/base/BaseParser.ts:228

Parameters

html

string

rules

[string, string, string][]

startPos?

number = 0

Returns

[Record<string, string>, number]

Inherited from

BaseParser.extractAll


extractAnchors()

protected extractAnchors(html, sourceUrl?): string[]

Defined in: packages/base/BaseParser.ts:243

Parameters

html

string

sourceUrl?

string

Returns

string[]

Inherited from

BaseParser.extractAnchors


extractAnchorTextsByHref()

protected extractAnchorTextsByHref(html, hrefPattern): string[]

Defined in: packages/base/BaseParser.ts:258

Parameters

html

string

hrefPattern

RegExp

Returns

string[]

Inherited from

BaseParser.extractAnchorTextsByHref


extractImageUrls()

protected extractImageUrls(html, sourceUrl?): string[]

Defined in: packages/base/BaseParser.ts:277

Parameters

html

string

sourceUrl?

string

Returns

string[]

Inherited from

BaseParser.extractImageUrls


extractSourceUrls()

protected extractSourceUrls(html, sourceUrl?): string[]

Defined in: packages/base/BaseParser.ts:316

Parameters

html

string

sourceUrl?

string

Returns

string[]

Inherited from

BaseParser.extractSourceUrls


getFlashVarsVideo()

protected getFlashVarsVideo(html, sourceUrl, uploader?, starred?): DefaultFlashVarsVideoOutput

Defined in: packages/base/BaseParser.ts:333

Parameters

html

string

sourceUrl

string

uploader?

string

starred?

string[]

Returns

DefaultFlashVarsVideoOutput

Inherited from

BaseParser.getFlashVarsVideo


collectElements()

protected collectElements(html, type, className?): Record<string, string>[]

Defined in: packages/base/BaseParser.ts:356

Parameters

html

string

type

string

className?

string

Returns

Record<string, string>[]

Inherited from

BaseParser.collectElements


extractVideoPosters()

protected extractVideoPosters(html, sourceUrl?): string[]

Defined in: packages/base/BaseParser.ts:377

Parameters

html

string

sourceUrl?

string

Returns

string[]

Inherited from

BaseParser.extractVideoPosters


extractDivHrefs()

protected extractDivHrefs(html, sourceUrl?): string[]

Defined in: packages/base/BaseParser.ts:394

Parameters

html

string

sourceUrl?

string

Returns

string[]

Inherited from

BaseParser.extractDivHrefs


extractVideoUrls()

protected extractVideoUrls(html): string[]

Defined in: packages/base/BaseParser.ts:405

Parameters

html

string

Returns

string[]

Inherited from

BaseParser.extractVideoUrls


extractAllUrls()

protected extractAllUrls(html): string[]

Defined in: packages/base/BaseParser.ts:415

Parameters

html

string

Returns

string[]

Inherited from

BaseParser.extractAllUrls


extractLinks()

protected extractLinks(html): string[]

Defined in: packages/base/BaseParser.ts:419

Parameters

html

string

Returns

string[]

Inherited from

BaseParser.extractLinks


extractMetaDescription()

protected extractMetaDescription(html): string

Defined in: packages/base/BaseParser.ts:431

Parameters

html

string

Returns

string

Inherited from

BaseParser.extractMetaDescription


extractMetaNameContent()

protected extractMetaNameContent(html, value): string

Defined in: packages/base/BaseParser.ts:439

Parameters

html

string

value

string

Returns

string

Inherited from

BaseParser.extractMetaNameContent


extractMetaPropertyContent()

protected extractMetaPropertyContent(html, value): string

Defined in: packages/base/BaseParser.ts:447

Parameters

html

string

value

string

Returns

string

Inherited from

BaseParser.extractMetaPropertyContent


collectAnchors()

protected collectAnchors(html, options?): object[]

Defined in: packages/base/BaseParser.ts:455

Parameters

html

string

options?

Optional settings object with the following fields:

sourceUrl?

string

className?

string

hrefPattern?

RegExp

Returns

object[]

Inherited from

BaseParser.collectAnchors


extractMetaKeywords()

protected extractMetaKeywords(html): string[]

Defined in: packages/base/BaseParser.ts:507

Parameters

html

string

Returns

string[]

Inherited from

BaseParser.extractMetaKeywords


extractTitle()

protected extractTitle(html): string

Defined in: packages/base/BaseParser.ts:517

Parameters

html

string

Returns

string

Inherited from

BaseParser.extractTitle


resolveUrl()

protected resolveUrl(raw, base?): string | null

Defined in: packages/base/BaseParser.ts:521

Parameters

raw

string

base?

string

Returns

string | null

Inherited from

BaseParser.resolveUrl


isHttpUrl()

protected isHttpUrl(url?): url is string

Defined in: packages/base/BaseParser.ts:530

Parameters

url?

string | null

Returns

url is string

Inherited from

BaseParser.isHttpUrl


decodeHtmlEntities()

protected decodeHtmlEntities(str): string

Defined in: packages/base/BaseParser.ts:534

Parameters

str

string

Returns

string

Inherited from

BaseParser.decodeHtmlEntities


extractByTag()

protected extractByTag(html, tag, options?): string[]

Defined in: packages/base/BaseParser.ts:545

Parameters

html

string

tag

string

options?

Optional settings object with the following fields:

className?

string

attribute?

string

Returns

string[]

Inherited from

BaseParser.extractByTag


extractOneByTag()

protected extractOneByTag(html, tag, options?): string | null

Defined in: packages/base/BaseParser.ts:571

Parameters

html

string

tag

string

options?

Optional settings object with the following field:

className?

string

Returns

string | null

Inherited from

BaseParser.extractOneByTag


extractScriptsByType()

protected extractScriptsByType(html, type, objectType?): Record<string, any>[]

Defined in: packages/base/BaseParser.ts:575

Parameters

html

string

type

string

objectType?

string

Returns

Record<string, any>[]

Inherited from

BaseParser.extractScriptsByType


extractByClass()

protected extractByClass(html, className): string[]

Defined in: packages/base/BaseParser.ts:595

Parameters

html

string

className

string

Returns

string[]

Inherited from

BaseParser.extractByClass


extractAttributes()

protected extractAttributes(html, tag, attr): string[]

Defined in: packages/base/BaseParser.ts:609

Parameters

html

string

tag

string

attr

string

Returns

string[]

Inherited from

BaseParser.extractAttributes


extractSpans()

protected extractSpans(html, className?): string[]

Defined in: packages/base/BaseParser.ts:622

Parameters

html

string

className?

string

Returns

string[]

Inherited from

BaseParser.extractSpans


extractDivs()

protected extractDivs(html, className?): string[]

Defined in: packages/base/BaseParser.ts:626

Parameters

html

string

className?

string

Returns

string[]

Inherited from

BaseParser.extractDivs


extractAnchorsContent()

protected extractAnchorsContent(html, className?): string[]

Defined in: packages/base/BaseParser.ts:630

Parameters

html

string

className?

string

Returns

string[]

Inherited from

BaseParser.extractAnchorsContent


extractH2s()

protected extractH2s(html, className?): string[]

Defined in: packages/base/BaseParser.ts:634

Parameters

html

string

className?

string

Returns

string[]

Inherited from

BaseParser.extractH2s


extractH3s()

protected extractH3s(html, className?): string[]

Defined in: packages/base/BaseParser.ts:638

Parameters

html

string

className?

string

Returns

string[]

Inherited from

BaseParser.extractH3s


extractLists()

protected extractLists(html, className?): string[]

Defined in: packages/base/BaseParser.ts:642

Parameters

html

string

className?

string

Returns

string[]

Inherited from

BaseParser.extractLists


extractBlocks()

protected extractBlocks(html, tag, className?): string[]

Defined in: packages/base/BaseParser.ts:646

Parameters

html

string

tag

string

className?

string

Returns

string[]

Inherited from

BaseParser.extractBlocks


extractKeyValue()

protected extractKeyValue(html, keyPattern, valuePattern): Record<string, string>

Defined in: packages/base/BaseParser.ts:654

Parameters

html

string

keyPattern

RegExp

valuePattern

RegExp

Returns

Record<string, string>

Inherited from

BaseParser.extractKeyValue


collectByClassNames()

protected collectByClassNames(html, classNames, options?): any[]

Defined in: packages/base/BaseParser.ts:669

Parameters

html

string

classNames

string | string[]

options?

Optional settings object with the following fields:

includeInnerHTML?

boolean

attributes?

string[]

sourceUrl?

string

Returns

any[]

Inherited from

BaseParser.collectByClassNames


transform()

transform(html, sourceUrl): Partial<DefaultExecutionResult<Partial<WikimediaOutput>>>

Defined in: packages/providers/wikimedia/WikimediaParser.ts:8

Extracts common metadata from a fetched HTML document.

Parameters

html

string

Raw HTML returned by the HTTP engine.

sourceUrl

string

Final URL used as the metadata source.

Returns

Partial<DefaultExecutionResult<Partial<WikimediaOutput>>>

Common extracted fields used as the base provider result.

Overrides

BaseParser.transform