From aa46b2ff06e467c41b623a16908d9e71709bfa80 Mon Sep 17 00:00:00 2001 From: Sam Pepler Date: Tue, 10 Feb 2026 16:41:33 +0000 Subject: [PATCH] added more records for md5 checks --- .DS_Store | Bin 0 -> 8196 bytes AUDIT.md | 202 +++++++++++++++++++++++++++++++++++++++ fbi_core/fbi_filesize.py | 4 +- fbi_core/fbi_tools.py | 2 +- 4 files changed, 204 insertions(+), 4 deletions(-) create mode 100644 .DS_Store create mode 100644 AUDIT.md diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..c1b1a9f72e89f0c7d24b11dd5bfa732260e1081f GIT binary patch literal 8196 zcmeHMF;7!b6h4<0tf`nXm@vM?K^-tKIsg?aF%B;L0BUJ1B$U@kA>d$|sELz77EK&X zOf=C+7hP~uAr7K0nn*}o+zkn{-?{hNo=f}MIw|2^a_@P$=R5a&_v?G-~ zi1)tn`q3-k6*!a%aD52S*`}?n8mmhO27Ls8jbmCjT&MT~Hn28rZPi$K(3A; zZCUzCVJ4HC3v-xDym@fr_SVS8dDowB=`YWSZv&(HDLwKhFU`;*<(%vkyNlKLAtZ;L zfyrAZ$KNrB22`C%G|J&)icS)IO0+^{^eLLAE9f&+ki68&_G4s>KuUfmUVWV?k86!0 z(RH5f(UPxP&4L8=f-iHeP=OXOMsP-zUT=+-ynj5|Nj-o0o^@@pekrVT?<_*{C(2@Z zs5pf`-a$=ko{@SJ9ovpZO5THO6W=ZGh;=3B_Gp)HzGv<|yxQR9nwnujKXU z%X(_~=3CZN#7i#C5l;TDPnaX9lnM|g0A8YnFZTv@V(|%4vNJ#0c*fWj;BVi|NLm`Y7<=jiED*( zc*R$lS>JwMs2c@T{2~p2?L54ptLf3tokdCRA0FOYdEKMyT?hH3)_)i2?9bGZI4xm6 z^Ry&)(S85SFYKb`lU#TA_EMwfnH2uIzRmnHZ_k}HmOk(;W_Hle=l=`Q-~SK%KK%x~ z0$zb=1ynpUnHhs$MPT@>9iV5?xiPM4tPa7z5T4h^aoE~F3~?P;U)tKLvGAb%gAW1z M{qNuZhyMNl8%*yvumAu6 literal 0 HcmV?d00001 diff --git a/AUDIT.md b/AUDIT.md new file mode 100644 index 0000000..5bbb3bf --- /dev/null +++ b/AUDIT.md @@ -0,0 +1,202 @@ + + +looking at autit output + +what do we do if a file appears to be corrupt: + +example + +```json +{ + "path": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626/20170409_20170626.geo.unw.png", + "type": "file", + "directory": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626", + "name": "20170409_20170626.geo.unw.png", + "ext": ".png", + "location": "on_disk", + "size": 759459, + "last_modified": "2022-03-10T11:49:21", + "created": 
"2020-11-02T21:21:02", + "md5": "1495e505ea0ecb2ec61c3b8c216fd562", + "last_audit": "2025-07-28T06:21:54.695653", + "fileset": "spot-38639-licsar_products", + "regex_date": "2017-04-09", + "corrupted": "2025-07-28T06:21:54.695631", + "corrupt_md5": "015ec317bd796e78716e3bf7a83fe8ff" +} +``` + +This is an apparently corrupt file. What are the options? + +1) Accept the file is corrupt, retrieve from backup or other source, replace file with restored content. Make a note in the record that this happened. +2) Accept the file is corrupt, but there is no usable backup. Note the file is corrupt and keep as is. Flag that we have accepted that we cannot recover this file. +3) If we do not believe the file is corrupt, then flag that we have accepted the newer checksum as the content. + + +How do you know it's corrupt? + +1) does not open in viewer or reader +2) blanks +3) storage system problems + +## action to reset + - all cases need some record of audit fixing. + - Current unfixed corruption records have a "corrupted" key and a "corrupt_md5" key + - + +case 1) would update the record and replace the file with the recovered one: + + ```json +{ + "path": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626/20170409_20170626.geo.unw.png", + "type": "file", + "directory": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626", + "name": "20170409_20170626.geo.unw.png", + "ext": ".png", + "location": "on_disk", + "size": 759459, + "last_modified": "2025-10-29T09:05:43.45678", # overwrite time + "created": "2020-11-02T21:21:02", + "md5": "1495e505ea0ecb2ec61c3b8c216fd562", # recovered file has same checksum + "last_audit": "2025-07-28T06:21:54.695653", + "fileset": "spot-38639-licsar_products", + "regex_date": "2017-04-09", + "corruption_records": [ + { + "md5": "1495e505ea0ecb2ec61c3b8c216fd562", + "last_modified": "2022-03-10T11:49:21", + "corrupted": "2025-07-28T06:21:54.695631", + "corrupt_md5": "015ec317bd796e78716e3bf7a83fe8ff", + "size": 759459, + 
"reset_type": "recovered", + "reset_date": "2025-10-29T09:05:44.12345" + } + ] +} +``` + +case 2) would update the record to reflect we can not take action to recover it: + + ```json +{ + "path": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626/20170409_20170626.geo.unw.png", + "type": "file", + "directory": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626", + "name": "20170409_20170626.geo.unw.png", + "ext": ".png", + "location": "on_disk", + "size": 759459, + "last_modified": "2022-03-10T11:49:21", + "created": "2020-11-02T21:21:02", + "md5": "015ec317bd796e78716e3bf7a83fe8ff", # corrupt checksum accepted + "last_audit": "2025-07-28T06:21:54.695653", + "fileset": "spot-38639-licsar_products", + "regex_date": "2017-04-09", + "corruption_records": [ + { + "md5": "1495e505ea0ecb2ec61c3b8c216fd562", + "last_modified": "2022-03-10T11:49:21", + "corrupted": "2025-07-28T06:21:54.695631", + "corrupt_md5": "015ec317bd796e78716e3bf7a83fe8ff", + "size": 759459, + "reset_type": "unrecoverable", + "reset_date": "2025-10-29T09:05:44.12345" + } + ] +} +``` + + +case 3) would update the record to reflect we do not think the file is corrupt: + + ```json +{ + "path": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626/20170409_20170626.geo.unw.png", + "type": "file", + "directory": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626", + "name": "20170409_20170626.geo.unw.png", + "ext": ".png", + "location": "on_disk", + "size": 759459, + "last_modified": "2022-03-10T11:49:21", + "created": "2020-11-02T21:21:02", + "md5": "015ec317bd796e78716e3bf7a83fe8ff", # current checksum + "last_audit": "2025-07-28T06:21:54.695653", + "fileset": "spot-38639-licsar_products", + "regex_date": "2017-04-09", + "corruption_records": [ + { + "md5": "1495e505ea0ecb2ec61c3b8c216fd562", + "last_modified": "2022-03-10T11:49:21", + "corrupted": "2025-07-28T06:21:54.695631", + "corrupt_md5": 
"015ec317bd796e78716e3bf7a83fe8ff", + "size": 759459, + "reset_type": "false positive", + "reset_date": "2025-10-29T09:05:44.12345" + } + ] +} +``` + + +General case + + ```json +{ + "path": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626/20170409_20170626.geo.unw.png", + "type": "file", + "directory": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626", + "name": "20170409_20170626.geo.unw.png", + "ext": ".png", + "location": "on_disk", + "size": 759459, + "last_modified": "2025-10-29T09:05:55.8765", + "created": "2020-11-02T21:21:02", + "md5": "1495e505ea0ecb2ec61c3b8c216fd562", + "last_audit": "2025-07-28T06:21:54.695653", + "fileset": "spot-38639-licsar_products", + "regex_date": "2017-04-09", + "change_history": [ + { + "old_record": { + "path": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626/20170409_20170626.geo.unw.png", + "type": "file", + "directory": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626", + "name": "20170409_20170626.geo.unw.png", + "ext": ".png", + "location": "on_disk", + "size": 759459, + "last_modified": "2022-03-10T11:49:21", + "created": "2020-11-02T21:21:02", + "md5": "1495e505ea0ecb2ec61c3b8c216fd562", + "last_audit": "2025-07-28T06:21:54.695653", + "fileset": "spot-38639-licsar_products", + "regex_date": "2017-04-09", + "corrupted": "2025-07-28T06:21:54.695631", + "corrupt_md5": "015ec317bd796e78716e3bf7a83fe8ff" + }, + "change": "reset corrupt record ready for overwrite", + "change_time": "2025-10-29T09:05:44.12345" + }, + { + "old_record": { + "path": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626/20170409_20170626.geo.unw.png", + "type": "file", + "directory": "/neodc/comet/data/licsar_products/153/153D_04699_131413/20170409_20170626", + "name": "20170409_20170626.geo.unw.png", + "ext": ".png", + "location": "on_disk", + "size": 759459, + "last_modified": "2022-03-10T11:49:21", + "created": 
"2020-11-02T21:21:02", + "md5": "1495e505ea0ecb2ec61c3b8c216fd562", + "last_audit": "2025-07-28T06:21:54.695653", + "fileset": "spot-38639-licsar_products", + "regex_date": "2017-04-09" + }, + "change": "modified", + "change_time": "2025-10-29T09:05:55.8765" + } + ] +} +``` \ No newline at end of file diff --git a/fbi_core/fbi_filesize.py b/fbi_core/fbi_filesize.py index 8dab9f7..42081a8 100644 --- a/fbi_core/fbi_filesize.py +++ b/fbi_core/fbi_filesize.py @@ -46,10 +46,8 @@ def __init__(self, *args, **kwargs): @click.command(cls=FilterCommand) @click.argument("paths", nargs=-1) -@click.option("--record", help="Show complete FBI record of latest files", is_flag=True) +@click.option("--record", help="Show complete FBI record", is_flag=True) def ls2(paths, record, **kwargs): - t0 = time.time() - t00 = time.time() for path in paths: for i, f in enumerate(fbi_records_under(path, **kwargs)): if record: diff --git a/fbi_core/fbi_tools.py b/fbi_core/fbi_tools.py index e9ad6bc..72eeb6a 100644 --- a/fbi_core/fbi_tools.py +++ b/fbi_core/fbi_tools.py @@ -571,7 +571,7 @@ def get_records_by_content(md5, filename=None, under=None, include_removed=False must.append({"term": {"name.keyword": {"value": filename}}}) query = {"query": {"bool": {"must": must, "must_not": must_not}}} - results = es.search(index=indexname, body=query, request_timeout=90) + results = es.search(index=indexname, body=query, request_timeout=90, size=10000) records = [] for r in results["hits"]["hits"]: records.append(r["_source"])