Skip to content

Latest commit

 

History

History
1085 lines (577 loc) · 19.1 KB

File metadata and controls

1085 lines (577 loc) · 19.1 KB

downflux


downflux / BaseParser

Class: BaseParser

Defined in: packages/base/BaseParser.ts:15

Default HTML parser shared by provider-specific parsers.

Remarks

Parsers exist to keep extraction rules close to the HTML they understand. The base parser collects common page fields such as anchors, images, meta tags, and source URLs, while provider parsers add the site-specific fields needed by transformers and pipelines.

Extended by

Constructors

Constructor

new BaseParser(): BaseParser

Returns

BaseParser

Properties

kvsResolver

protected kvsResolver: KvsResolver

Defined in: packages/base/BaseParser.ts:16

Methods

transform()

transform(html, sourceUrl): Partial<DefaultExecutionResult>

Defined in: packages/base/BaseParser.ts:25

Extracts common metadata from a fetched HTML document.

Parameters

html

string

Raw HTML returned by the HTTP engine.

sourceUrl

string

Final URL used as the metadata source.

Returns

Partial<DefaultExecutionResult>

Common extracted fields used as the base provider result.


extractScriptMethodInput()

protected extractScriptMethodInput(fnName, html): string | null

Defined in: packages/base/BaseParser.ts:54

Extracts the first string argument passed to a named script function.

Parameters

fnName

string

Function name to search for.

html

string

HTML or script text to inspect.

Returns

string | null

The first string argument, or null when the call is absent.


getFlashVars()

protected getFlashVars(html): FlashVarsOutput

Defined in: packages/base/BaseParser.ts:66

Extracts KVS flashVars video metadata from inline scripts.

Parameters

html

string

HTML containing one or more KVS flashVars blocks.

Returns

FlashVarsOutput

Normalized KVS fields, video sources, previews, and timelines.


extractElementText()

protected extractElementText(html, begin, end, fallback?): string

Defined in: packages/base/BaseParser.ts:190

Parameters

html

string

begin

string

end

string

fallback?

string = ''

Returns

string


extractElementTextPair()

protected extractElementTextPair(html, begin, end, pos?): [string | null, number]

Defined in: packages/base/BaseParser.ts:199

Parameters

html

string

begin

string

end

string

pos?

number = 0

Returns

[string | null, number]


extractAllPairs()

protected extractAllPairs(html, begin, end): Generator<string>

Defined in: packages/base/BaseParser.ts:208

Parameters

html

string

begin

string

end

string

Returns

Generator<string>


extractAll()

protected extractAll(html, rules, startPos?): [Record<string, string>, number]

Defined in: packages/base/BaseParser.ts:228

Parameters

html

string

rules

[string, string, string][]

startPos?

number = 0

Returns

[Record<string, string>, number]


extractAnchors()

protected extractAnchors(html, sourceUrl?): string[]

Defined in: packages/base/BaseParser.ts:243

Parameters

html

string

sourceUrl?

string

Returns

string[]


extractAnchorTextsByHref()

protected extractAnchorTextsByHref(html, hrefPattern): string[]

Defined in: packages/base/BaseParser.ts:258

Parameters

html

string

hrefPattern

RegExp

Returns

string[]


extractImageUrls()

protected extractImageUrls(html, sourceUrl?): string[]

Defined in: packages/base/BaseParser.ts:277

Parameters

html

string

sourceUrl?

string

Returns

string[]


extractSourceUrls()

protected extractSourceUrls(html, sourceUrl?): string[]

Defined in: packages/base/BaseParser.ts:316

Parameters

html

string

sourceUrl?

string

Returns

string[]


getFlashVarsVideo()

protected getFlashVarsVideo(html, sourceUrl, uploader?, starred?): DefaultFlashVarsVideoOutput

Defined in: packages/base/BaseParser.ts:333

Parameters

html

string

sourceUrl

string

uploader?

string

starred?

string[]

Returns

DefaultFlashVarsVideoOutput


collectElements()

protected collectElements(html, type, className?): Record<string, string>[]

Defined in: packages/base/BaseParser.ts:356

Parameters

html

string

type

string

className?

string

Returns

Record<string, string>[]


extractVideoPosters()

protected extractVideoPosters(html, sourceUrl?): string[]

Defined in: packages/base/BaseParser.ts:377

Parameters

html

string

sourceUrl?

string

Returns

string[]


extractDivHrefs()

protected extractDivHrefs(html, sourceUrl?): string[]

Defined in: packages/base/BaseParser.ts:394

Parameters

html

string

sourceUrl?

string

Returns

string[]


extractVideoUrls()

protected extractVideoUrls(html): string[]

Defined in: packages/base/BaseParser.ts:405

Parameters

html

string

Returns

string[]


extractAllUrls()

protected extractAllUrls(html): string[]

Defined in: packages/base/BaseParser.ts:415

Parameters

html

string

Returns

string[]


extractLinks()

protected extractLinks(html): string[]

Defined in: packages/base/BaseParser.ts:419

Parameters

html

string

Returns

string[]


extractMetaDescription()

protected extractMetaDescription(html): string

Defined in: packages/base/BaseParser.ts:431

Parameters

html

string

Returns

string


extractMetaNameContent()

protected extractMetaNameContent(html, value): string

Defined in: packages/base/BaseParser.ts:439

Parameters

html

string

value

string

Returns

string


extractMetaPropertyContent()

protected extractMetaPropertyContent(html, value): string

Defined in: packages/base/BaseParser.ts:447

Parameters

html

string

value

string

Returns

string


collectAnchors()

protected collectAnchors(html, options?): object[]

Defined in: packages/base/BaseParser.ts:455

Parameters

html

string

options?

Optional settings object with the following fields:

options.sourceUrl?

string

options.className?

string

options.hrefPattern?

RegExp

Returns

object[]


extractMetaKeywords()

protected extractMetaKeywords(html): string[]

Defined in: packages/base/BaseParser.ts:507

Parameters

html

string

Returns

string[]


extractTitle()

protected extractTitle(html): string

Defined in: packages/base/BaseParser.ts:517

Parameters

html

string

Returns

string


resolveUrl()

protected resolveUrl(raw, base?): string | null

Defined in: packages/base/BaseParser.ts:521

Parameters

raw

string

base?

string

Returns

string | null


isHttpUrl()

protected isHttpUrl(url?): url is string

Defined in: packages/base/BaseParser.ts:530

Parameters

url?

string | null

Returns

url is string


decodeHtmlEntities()

protected decodeHtmlEntities(str): string

Defined in: packages/base/BaseParser.ts:534

Parameters

str

string

Returns

string


extractByTag()

protected extractByTag(html, tag, options?): string[]

Defined in: packages/base/BaseParser.ts:545

Parameters

html

string

tag

string

options?

Optional settings object with the following fields:

options.className?

string

options.attribute?

string

Returns

string[]


extractOneByTag()

protected extractOneByTag(html, tag, options?): string | null

Defined in: packages/base/BaseParser.ts:571

Parameters

html

string

tag

string

options?

Optional settings object with the following field:

options.className?

string

Returns

string | null


extractScriptsByType()

protected extractScriptsByType(html, type, objectType?): Record<string, any>[]

Defined in: packages/base/BaseParser.ts:575

Parameters

html

string

type

string

objectType?

string

Returns

Record<string, any>[]


extractByClass()

protected extractByClass(html, className): string[]

Defined in: packages/base/BaseParser.ts:595

Parameters

html

string

className

string

Returns

string[]


extractAttributes()

protected extractAttributes(html, tag, attr): string[]

Defined in: packages/base/BaseParser.ts:609

Parameters

html

string

tag

string

attr

string

Returns

string[]


extractSpans()

protected extractSpans(html, className?): string[]

Defined in: packages/base/BaseParser.ts:622

Parameters

html

string

className?

string

Returns

string[]


extractDivs()

protected extractDivs(html, className?): string[]

Defined in: packages/base/BaseParser.ts:626

Parameters

html

string

className?

string

Returns

string[]


extractAnchorsContent()

protected extractAnchorsContent(html, className?): string[]

Defined in: packages/base/BaseParser.ts:630

Parameters

html

string

className?

string

Returns

string[]


extractH2s()

protected extractH2s(html, className?): string[]

Defined in: packages/base/BaseParser.ts:634

Parameters

html

string

className?

string

Returns

string[]


extractH3s()

protected extractH3s(html, className?): string[]

Defined in: packages/base/BaseParser.ts:638

Parameters

html

string

className?

string

Returns

string[]


extractLists()

protected extractLists(html, className?): string[]

Defined in: packages/base/BaseParser.ts:642

Parameters

html

string

className?

string

Returns

string[]


extractBlocks()

protected extractBlocks(html, tag, className?): string[]

Defined in: packages/base/BaseParser.ts:646

Parameters

html

string

tag

string

className?

string

Returns

string[]


extractKeyValue()

protected extractKeyValue(html, keyPattern, valuePattern): Record<string, string>

Defined in: packages/base/BaseParser.ts:654

Parameters

html

string

keyPattern

RegExp

valuePattern

RegExp

Returns

Record<string, string>


collectByClassNames()

protected collectByClassNames(html, classNames, options?): any[]

Defined in: packages/base/BaseParser.ts:669

Parameters

html

string

classNames

string | string[]

options?

Optional settings object with the following fields:

options.includeInnerHTML?

boolean

options.attributes?

string[]

options.sourceUrl?

string

Returns

any[]