diff --git a/dev/sft/dataset.jsonl b/dev/sft/dataset.jsonl new file mode 100644 index 00000000..bd223041 --- /dev/null +++ b/dev/sft/dataset.jsonl @@ -0,0 +1,50 @@ +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a medical_record. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"**Confidential Medical Record**\n\n**Patient Information:**\n\n- **Name:** Victor Roberts\n- **Date of Birth:** 1982-04-07\n- **Age:** 28\n- **Gender:** Male\n- **Personal ID:** 260-56-6683\n\n---\n\n**Medical History Overview:**\n\n**Current Medical Condition:**\n- **Diagnosis:** Aneurysm \n- **Date of Diagnosis:** 2010-09-15 \n- **Presenting Symptoms:** Severe headaches, vision impairment, dizziness.\n\n**Treatment Plan:**\n- **Medications Prescribed:**\n - Metoprolol 50 mg, once daily\n - Aspirin 81 mg, once daily\n- **Scheduled Follow-ups:** Every 3 months with Neurology Department\n\n**Previous Medical History:**\n- **2001:** Tonsillectomy\n- **1996:** Admitted for severe bout of influenza\n- **Allergies:** Penicillin\n\n---\n\n**Lifestyle and Social History:**\n\n- **Occupation:** Structural Engineer\n- **Smoking Status:** Non-smoker\n- **Alcohol Consumption:** Occasional, primarily socially\n- **Exercise Habits:** Engages in light exercises twice a week\n\n---\n\n**Family Medical History:**\n\n- **Father:** Hypertension\n- **Mother:** No known medical conditions\n- **Siblings:** One brother, no known medical conditions\n\n---\n\n**Emergency Contact:**\n\n- **Name:** Linda Roberts\n- **Relationship:** Sister\n- **Phone Number:** [Redacted]\n\n---\n\n**Consultations:**\n\n**Neurologist:** Dr. Emily Chen \n**Next Appointment:** 2023-11-12 at 10:00 AM\n\n**Cardiologist:** Dr. Raj Singh \n**Next Appointment:** 2023-11-20 at 2:30 PM\n\n**Notes:**\n- Continue to monitor blood pressure closely.\n- MRI scan of the brain scheduled for 2023-12-05 to evaluate treatment effectiveness.\n\n---\n\n**Doctor's Notes:**\n\n- **Reviewed by:** Dr. Natalie Brooks, MD\n- **Date of Review:** 2023-10-19\n\n**Remarks:**\n- Patient remains attentive to prescribed treatment and lifestyle recommendations. \n- Mr. Roberts reports no new symptoms or concerns since the last visit.\n- Encourage continuation of cardiovascular adherence and stress-reducing activities.\n\n**End of Record**"},{"content":"{\"fields_to_redact\":[{\"string\":\"Victor Roberts\",\"pii_type\":\"person_name\"},{\"string\":\"1982-04-07\",\"pii_type\":\"date_of_birth\"},{\"string\":\"28\",\"pii_type\":\"age\"},{\"string\":\"Male\",\"pii_type\":\"gender\"},{\"string\":\"260-56-6683\",\"pii_type\":\"personal_id\"},{\"string\":\"Aneurysm\",\"pii_type\":\"medical_condition\"},{\"string\":\"2010-09-15\",\"pii_type\":\"date\"},{\"string\":\"Penicillin\",\"pii_type\":\"medical_condition\"},{\"string\":\"Linda Roberts\",\"pii_type\":\"person_name\"},{\"string\":\"Dr. Emily Chen\",\"pii_type\":\"person_name\"},{\"string\":\"2023-11-12\",\"pii_type\":\"date\"},{\"string\":\"Dr. Raj Singh\",\"pii_type\":\"person_name\"},{\"string\":\"2023-11-20\",\"pii_type\":\"date\"},{\"string\":\"2023-12-05\",\"pii_type\":\"date\"},{\"string\":\"Dr. Natalie Brooks, MD\",\"pii_type\":\"person_name\"},{\"string\":\"2023-10-19\",\"pii_type\":\"date\"},{\"string\":\"Hypertension\",\"pii_type\":\"medical_condition\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a company_memo. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"---\n\n**Memo**\n\n**To**: All Employees \n**From**: Management \n**Date**: January 11, 2011 \n**Subject**: Update on Internal Policies and Procedures \n\nDear Team,\n\nAs we strive towards continuous excellence at Clay Group, we are implementing some updates to our internal policies and procedures, effective immediately. Please take a moment to review these changes and how they may impact your daily operations.\n\n**1. Emergency Protocol** \nThe safety and well-being of our employees are a top priority. We have updated our emergency contact procedures. In case of any urgent situations, please contact the main office line at 001-835-693-0988 with the following extension: 8706. Make sure this number is readily accessible.\n\n**2. Cybersecurity Policy** \nWith the recent rise in cybersecurity threats, it is imperative to maintain a robust security posture. Starting next month, all team members will be required to undergo additional training on the secure handling of sensitive information. Details will be communicated in due course.\n\n**3. Remote Working Policies** \nGiven the positive feedback from our previous trials, we are pleased to announce the continuation of our flexible working arrangements. Employees can choose to work remotely up to two days a week, pending manager approval and adherence to productivity benchmarks.\n\n**4. Internal Communications** \nTo enhance transparency and communication within the organization, we are introducing a new company-wide digital bulletin board. This platform will be used for announcements, team achievements, and events. More information on accessing the bulletin board will follow.\n\nYour cooperation and dedication are vital as we work together to maintain Clay Group's reputation as a leader in innovation and employee satisfaction. If you have any questions regarding these updates, do not hesitate to contact your department head.\n\nThank you for your continued hard work and commitment.\n\nBest regards,\n\n[Signature, if applicable]\n\nManagement \nClay Group\n\n---"},{"content":"{\"fields_to_redact\":[{\"string\":\"January 11, 2011\",\"pii_type\":\"date\"},{\"string\":\"Clay Group\",\"pii_type\":\"organization_name\"},{\"string\":\"001-835-693-0988\",\"pii_type\":\"phone_number\"},{\"string\":\"8706\",\"pii_type\":\"other_id\"},{\"string\":\"Clay Group\",\"pii_type\":\"organization_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a company_memo. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"---\n\n**MEMORANDUM**\n\n**TO:** All Staff\n\n**FROM:** Amy Singleton, Director of Human Resources\n\n**DATE:** January 29, 1993\n\n**SUBJECT:** Exciting New Developments at O'Connor-Thomas!\n\nDear Team,\n\nI am thrilled to reach out to each of you on behalf of O'Connor-Thomas, an organization that prides itself on innovation and excellence. As we continue our journey of growth and success, I would like to share some exciting updates and opportunities that await us.\n\n**Innovation and Sustainability Initiatives**\n\nFirst, I am pleased to announce that we will be launching a new series of sustainability initiatives designed to reduce our environmental footprint. As a sector leader, O'Connor-Thomas is committed to responsible innovation, and we are adamant about making our operations greener. More details will be provided in the upcoming weeks.\n\n**Employee Development Workshops**\n\nWe understand the importance of continuous learning, which is why we are introducing a series of workshops focused on skill enhancement across various departments. These workshops aim to augment your technical and professional skills, ensuring our team remains at the forefront of industry trends. Participation details will soon follow.\n\n**Building a Stronger Community**\n\nOur company believes in the power of community. This year, we will increase our involvement with local charities and community events. Volunteering tells a great story of who we are as a company and individuals. Let's make a positive impact together!\n\nPlease feel free to reach out to me if you have any questions or suggestions regarding our upcoming projects. Feedback is always appreciated as it helps us build a better and more inclusive workplace. Together, we can achieve more!\n\nLooking forward to all the great things 1993 will bring for O'Connor-Thomas and each of us individually.\n\nWarm regards,\n\n**Amy Singleton** \nDirector of Human Resources \nO’Connor-Thomas \n\n---\n\nRemember, keep pushing boundaries and stay inspired!\n\n---"},{"content":"{\"fields_to_redact\":[{\"string\":\"January 29, 1993\",\"pii_type\":\"date\"},{\"string\":\"O'Connor-Thomas\",\"pii_type\":\"organization_name\"},{\"string\":\"Amy Singleton\",\"pii_type\":\"person_name\"},{\"string\":\"Amy Singleton\",\"pii_type\":\"person_name\"},{\"string\":\"O’Connor-Thomas\",\"pii_type\":\"organization_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a medical_record. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"```plaintext\n CONFIDENTIAL MEDICAL RECORD\n\nPatient Name: Brenda Davila\nAge: 58\nGender: Male\nDate of Birth: 25th December 2022 Identifier Number: ZZ025922T\n\n--------------------------------------------------------------------------\nADDRESS: \n529 Tina Villages Suite 522\nLake Robert, CO 31592\n\nPHONE: +1-555-834-9771\n\n--------------------------------------------------------------------------\nMEDICAL DIAGNOSIS:\n\nCondition: Lead Poisoning\n\nBackground: \nLead poisoning is a type of metal poisoning caused by increased levels of the heavy metal lead in the body. Exposure to lead over the long term can cause harmful effects on the brain, liver, kidney, and reproductive systems, among others.\n\nPresenting Symptoms: \n- Abdominal Pain\n- Fatigue\n- Irritability\n- Memory Loss\n\nCurrent Treatment Plan:\n- Chelation Therapy: Succimer 10 mg/kg orally every 8 hours for 5 days\n- Regular Monitoring of Blood Lead Levels (BLL)\n- Dietary Adjustments: Ensure high intake of calcium, iron, and vitamin C\n- Environmental modifications to reduce lead exposure\n\nNext Review Appointment: 15th November 2023\n\n--------------------------------------------------------------------------\nNotes:\n- Due to age discrepancy, additional verification documents for identity and medical history may be required.\n- Careful consideration of residential environment as potential source of lead.\n- Emergency Contact details were not provided.\n\n---------------------------------------------------------------------------\n\nDoctor: Dr. Anthony Williamson \nLicense No: MD335912\nContact: a.williamson@lakerobertclinic.com \n\nThis document is intended for the sole use of the individual to whom it is addressed and may contain confidential and privileged information. Any unauthorized review, use, disclosure or distribution is prohibited. If you are not the intended recipient, please contact the sender by reply email and destroy all copies of the original message.\n```"},{"content":"{\"fields_to_redact\":[{\"string\":\"Brenda Davila\",\"pii_type\":\"person_name\"},{\"string\":\"58\",\"pii_type\":\"age\"},{\"string\":\"Male\",\"pii_type\":\"gender\"},{\"string\":\"25th December 2022\",\"pii_type\":\"date_of_birth\"},{\"string\":\"ZZ025922T\",\"pii_type\":\"personal_id\"},{\"string\":\"529 Tina Villages Suite 522\\nLake Robert, CO 31592\",\"pii_type\":\"street_address\"},{\"string\":\"+1-555-834-9771\",\"pii_type\":\"phone_number\"},{\"string\":\"Lead Poisoning\",\"pii_type\":\"medical_condition\"},{\"string\":\"15th November 2023\",\"pii_type\":\"date\"},{\"string\":\"Dr. Anthony Williamson\",\"pii_type\":\"person_name\"},{\"string\":\"a.williamson@lakerobertclinic.com\",\"pii_type\":\"email_address\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a personal_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Long Time No See!\n\nHi Margaud-Anaïs,\n\nI hope this email finds you well. It’s been ages since we last caught up, and I've missed our conversations. I remember the last time we chatted, you mentioned that you were working on an exciting project. How's that going? \n\nBy the way, I was reminiscing about that hilarious incident during our holiday trip back in 2003. Unbelievably, January 2nd, 2003, will always be etched in my memory as the day we nearly missed that flight because we were too engrossed in spotting cute street cats. Time really flies!\n\nAnyway, I'm reaching out because I’m planning a small get-together with some old friends next month and I would love for you to join us. Let me know if you're available – it would be so nice to catch up in person and relive some of those fun memories.\n\nPlease feel free to either shoot me an email at josephestrada@example.org or give me a call at 03 45 02 61 71 when you get a chance.\n\nLooking forward to hearing from you soon!\n\nWarm regards,\n\nJoseph"},{"content":"{\"fields_to_redact\":[{\"string\":\"Margaud-Anaïs\",\"pii_type\":\"person_name\"},{\"string\":\"January 2nd, 2003\",\"pii_type\":\"date\"},{\"string\":\"josephestrada@example.org\",\"pii_type\":\"email_address\"},{\"string\":\"03 45 02 61 71\",\"pii_type\":\"phone_number\"},{\"string\":\"Joseph\",\"pii_type\":\"person_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a personal_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Catching Up and Exciting News!\n\nHi James,\n\nI hope this email finds you well! It’s been too long since we last caught up. I wanted to take a moment to reach out and share some exciting updates with you.\n\nFirst off, can you believe it’s already been a year since we had that unforgettable trip to the Grand Canyon? Time really flies! It was truly an experience to remember.\n\nOn a personal note, I’ve embarked on a new project that I’m really passionate about. I’m working with a local non-profit organization focused on environmental conservation here in Seville. It’s been incredibly rewarding to see the impact we’re making and I’d love to tell you more about it.\n\nAlso, since you are the ultimate movie buff, I wanted to get your recommendations on the latest sci-fi flicks. I’m planning a weekend movie marathon, and nobody curates a list quite like you do!\n\nLooking forward to hearing from you soon. Let's try to set up a call or something sometime next month?\n\nBest wishes,\n\nAlejo Villaverde Morcillo\n\nP.S.: If you’re up for a little nostalgia, I stumbled upon some old photos from our hiking days. Happy to share them if you’re interested!\n\nSent on: Monday, 28th February 2005"},{"content":"{\"fields_to_redact\":[{\"string\":\"James\",\"pii_type\":\"person_name\"},{\"string\":\"Seville\",\"pii_type\":\"street_address\"},{\"string\":\"Alejo Villaverde Morcillo\",\"pii_type\":\"person_name\"},{\"string\":\"Monday, 28th February 2005\",\"pii_type\":\"date\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Urgent Assistance Required for Jessica Anderson\n\nFrom: janice34@example.net \nDate: 2024-07-30 \nTo: support@pmp-services.com \n\nHi Support Team,\n\nI hope this email finds you well. My name is Jessica Anderson, and I am having some trouble accessing my account with your organization, Pichardo, Montez y Padilla. I attempted several times today and was unable to log in successfully.\n\nDetails for verification:\n- Date of Birth: 1996-10-27\n- Personal ID: 564 588 200\n- Banking Number: EALY56899882811142\n- Registered Phone Number: +44(0)1174960628\n\nThe error message said something about account authorization, which I haven't encountered before. I suspect it may have something to do with recent changes in my profile information that I updated last week. Could you please verify my account status and assist me in restoring access as soon as possible?\n\nThank you for your prompt attention to this matter. I'm looking forward to your urgent response.\n\nBest regards, \nJessica Anderson"},{"content":"{\"fields_to_redact\":[{\"string\":\"Jessica Anderson\",\"pii_type\":\"person_name\"},{\"string\":\"janice34@example.net\",\"pii_type\":\"email_address\"},{\"string\":\"Pichardo, Montez y Padilla\",\"pii_type\":\"organization_name\"},{\"string\":\"1996-10-27\",\"pii_type\":\"date_of_birth\"},{\"string\":\"564 588 200\",\"pii_type\":\"personal_id\"},{\"string\":\"EALY56899882811142\",\"pii_type\":\"banking_number\"},{\"string\":\"+44(0)1174960628\",\"pii_type\":\"phone_number\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a utility_bill. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"```\nSunshine Power & Utilities Co.\nCustomer Service Hotline: 1-800-432-0098\nEmail: support@sunpowerutilities.net\nWebsite: www.sunpowerutilities.net\n\n---------------------------------------------------------------------\nUTILITY BILL STATEMENT\n---------------------------------------------------------------------\n\nAccount Number: 2049-987-231 Date: 1995-07-11\n\nBilling Summary:\n----------------------------------------------\nName: Mtro. Catalina Olvera\nService Address: 749 Danielle Lakes Apt. 791\n Lake Davidton, NU E3L 3V2\n\n---------------------------------------------------------------------\n\nElectricity Consumption Details:\n----------------------------------------------\nBilling Period: Jun 01, 1995 - Jun 30, 1995\nMeter Number: EL-67234-NE\n\nPrevious Reading: 12,345 kWh\nCurrent Reading: 12,910 kWh\nTotal Usage: 565 kWh\n\nCurrent Charge:\nElectricity Charge: $0.12/kWh\nTotal Electricity Cost: $67.80\n\nOther Charges:\n- Environment Fee: $4.50\n- Service Connection Fee: $15.00\n\nTotal Amount Due: $87.30\n\n---------------------------------------------------------------------\n\nPayment Due Date: July 26, 1995\n\nPayment Methods:\n- By Phone: Call 1-800-123-5678 with your account number\n- Online: Log in to your account at www.sunpowerutilities.net\n- Mobile App: Available on iOS and Android\n- In-Person: Visit our office at 123 Solar Street, Lake Davidton\n- Mail: Use the return envelope enclosed with this bill\n\nNote: Late fees apply if payment is not received by the due date.\n\nThank you for using Sunshine Power & Utilities Co.!\n\nPlease remember to conserve energy:\n- Turn off lights when not in use.\n- Use energy-efficient appliances.\n- Set your thermostat wisely.\n\n---------------------------------------------------------------------\n\nVisit our website for tips on energy conservation and to learn more\nabout our renewable energy projects!\n---------------------------------------------------------------------\n```\n"},{"content":"{\"fields_to_redact\":[{\"string\":\"support@sunpowerutilities.net\",\"pii_type\":\"email_address\"},{\"string\":\"2049-987-231\",\"pii_type\":\"personal_id\"},{\"string\":\"1995-07-11\",\"pii_type\":\"date\"},{\"string\":\"Mtro. Catalina Olvera\",\"pii_type\":\"person_name\"},{\"string\":\"749 Danielle Lakes Apt. 791\\n Lake Davidton, NU E3L 3V2\",\"pii_type\":\"street_address\"},{\"string\":\"123 Solar Street, Lake Davidton\",\"pii_type\":\"street_address\"},{\"string\":\"1995\",\"pii_type\":\"date\"},{\"string\":\"1995\",\"pii_type\":\"date\"},{\"string\":\"July 26, 1995\",\"pii_type\":\"date\"}]}","refusal":null,"role":"assistant"}],"rejected_message":{"content":"{\"fields_to_redact\":[{\"string\":\"support@sunpowerutilities.net\",\"pii_type\":\"email_address\"},{\"string\":\"Mtro. Catalina Olvera\",\"pii_type\":\"person_name\"},{\"string\":\"749 Danielle Lakes Apt. 791\\n Lake Davidton, NU E3L 3V2\",\"pii_type\":\"street_address\"},{\"string\":\"2049-987-231\",\"pii_type\":\"personal_id\"},{\"string\":\"1995-07-11\",\"pii_type\":\"date\"},{\"string\":\"June 01, 1995 - June 30, 1995\",\"pii_type\":\"date\"},{\"string\":\"July 26, 1995\",\"pii_type\":\"date\"}]}","refusal":null,"role":"assistant","tool_calls":[]},"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a rental_agreement. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"**RESIDENTIAL RENTAL AGREEMENT**\n\n**This Residential Lease Agreement (\"Agreement\") is entered into on the 16th day of April, 1983, by and between Patel Inc (\"Landlord\") and Brian Smith (\"Tenant\").**\n\n**Landlord:**\nPatel Inc \nRegistered Office: 47 Zenith Plaza \nCorporate ID: ALPHA-83920\n\n**Tenant:**\nBrian Smith \nContact Number: +44(0)292018253 \nCurrent Address: Calzada Argentina 247 Edif. 025, Depto. 150 \nVieja Namibia, BCS 32896-6646\n\n**1. TERM:**\nThe term of this lease shall commence on the 1st day of May, 1983, and shall continue on a month-to-month basis unless terminated as provided herein.\n\n**2. RENT:**\nTenant agrees to pay Landlord a monthly rent of £850, due on the 1st day of each month. Payments shall be made by bank transfer to Landlord's designated account.\n\n**3. SECURITY DEPOSIT:**\nA security deposit of £1,000 is due at the signing of this Agreement and is refundable upon lease termination, subject to the terms herein.\n\n**4. PREMISES:**\nThe Landlord leases to the Tenant the residential premises located at Calzada Argentina 247 Edif. 025, Depto. 150, Vieja Namibia, BCS 32896-6646.\n\n**5. UTILITIES:**\nTenant will be responsible for all utilities and services incurred at the premises except for water and maintenance, which shall be paid by the Landlord.\n\n**6. MAINTENANCE AND REPAIRS:**\nTenant shall maintain the premises in a clean and sanitary condition. Tenant shall be responsible for any repairs due to negligence or misuse, excluding normal wear and tear.\n\n**7. ALTERATIONS:**\nTenant shall not make any alterations or additions to the premises without the prior written consent of Landlord. Any improvements will become the property of the Landlord.\n\n**8. ACCESS:**\nLandlord reserves the right to enter the premises for inspection, repairs, or showings, provided that 24 hours' notice is given, except in emergency situations.\n\n**9. TERMINATION:**\nEither party may terminate this Agreement by providing a 30-day written notice. Landlord may terminate immediately if Tenant violates any terms of this Agreement.\n\n**Tenant Signatory**\n\n_________________________ \nBrian Smith\n\n**Landlord Signatory**\n\n_________________________ \nAuthorized Representative \nPatel Inc\n\n**Witnesses:**\n\n1. _________________________ Date: ___________\n\n2. _________________________ Date: ___________\n\n**This document constitutes the entire agreement between the parties and supersedes all previous representations or agreements.**"},{"content":"{\"fields_to_redact\":[{\"string\":\"April, 1983\",\"pii_type\":\"date\"},{\"string\":\"Brian Smith\",\"pii_type\":\"person_name\"},{\"string\":\"+44(0)292018253\",\"pii_type\":\"phone_number\"},{\"string\":\"Calzada Argentina 247 Edif. 025, Depto. 150, Vieja Namibia, BCS 32896-6646\",\"pii_type\":\"street_address\"},{\"string\":\"May, 1983\",\"pii_type\":\"date\"},{\"string\":\"ALPHA-83920\",\"pii_type\":\"other_id\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a rental_agreement. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"**Residental Lease Agreement**\n\n**This Agreement is made on the 28th day of February, 2003**\n\n**BETWEEN**\n\nLessor: Hillcrest Property Management \nAddress: 345 Sunrise Avenue, Alexbury, LN3 9UP \nAND \nLessee: Jennifer Payne \nPersonal ID: 14698103794 \nAddress: 49 Collins Fall, Alexbury, LN2 8WR \n\n**TERMS OF RENTAL**\n\n1. **Property Description:** \n The property is a 2-bedroom apartment located at 49 Collins Fall, Alexbury, LN2 8WR. \n\n2. **Term of Lease:** \n The lease commences on March 1, 2003, and ends on February 28, 2004. Upon expiration, the lease may be extended or renewed by mutual agreement of both parties.\n\n3. **Rent:** \n The monthly rent shall be £850, payable in advance on the first day of each month to Hillcrest Property Management at the address provided above.\n\n4. **Security Deposit:** \n A security deposit of £850 shall be provided by the Lessee upon signing of this Agreement. This deposit will be held as security for any damages caused to the property or failure to fulfill the terms of the lease.\n\n5. **Utilities:** \n The Lessee agrees to assume responsibility for all utility services such as water, gas, electricity, internet and any other relevant utilities, starting from the commencement date of the lease.\n\n6. **Agreement of Use:** \n The premises shall only be used for residential purposes and occupied by the Lessee. No subletting or assignment of the lease is permitted without prior written consent from the Lessor.\n\n7. **Inspection:** \n The Lessor reserves the right to inspect the premises upon giving prior reasonable notice to ensure that the terms of the agreement are being upheld.\n\n8. **Maintenance and Repairs:** \n The Lessee must maintain the premises in good condition and report any required repairs to the Lessor in a timely manner. The Lessee shall be liable for costs due to negligence or excessive damage.\n\n9. **Termination of Lease:** \n Either party may terminate this lease upon 30-day written notice, with reasons being either breach of agreement or mutual desire to end tenancy.\n\n10. **Notices:** \n All notices under this lease shall be in writing and sent to the addresses mentioned herein.\n\n**Signatures:** \n\nLessor: _____________________ Date: _____________ \nLessee, Jennifer Payne: _____________________ Date: 2003-02-28 \n\n---\n\n*Note: This Lease Agreement is a legally binding contract. Ensure you have read and understood all terms and conditions before signing.*"},{"content":"{\"fields_to_redact\":[{\"string\":\"February, 2003\",\"pii_type\":\"date\"},{\"string\":\"Jennifer Payne\",\"pii_type\":\"person_name\"},{\"string\":\"14698103794\",\"pii_type\":\"personal_id\"},{\"string\":\"49 Collins Fall, Alexbury, LN2 8WR\",\"pii_type\":\"street_address\"},{\"string\":\"March 1, 2003\",\"pii_type\":\"date\"},{\"string\":\"February 28, 2004\",\"pii_type\":\"date\"},{\"string\":\"Jennifer Payne\",\"pii_type\":\"person_name\"},{\"string\":\"2003-02-28\",\"pii_type\":\"date\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Urgent Assistance Needed with Account Issue\n\nDate: September 14, 1990\n\nFrom: jonathan31@example.com\n\nTo: support@examplebank.com\n\nDear Customer Support Team,\n\nMy name is Magdalena Ernesto Muñiz, and I am reaching out regarding some issues I've encountered with my banking account. I hope you can assist me with this matter as soon as possible.\n\nFirstly, let me provide you with some details for verification. My personal ID is 746-39-3991, and the associated banking number is SCIH11262643243763. I can be contacted directly at my phone number, +33 2 49 68 93 16, if a call is necessary.\n\nThe issue began on September 10th when I noticed an unauthorized transaction on my statement. This transaction did not align with my usual activity, which is particularly concerning. As a practicing Christian, I strive for integrity and transparency in all my dealings, and I have always maintained a stringent check on my account activities.\n\nI would greatly appreciate your immediate attention to this matter, as I rely heavily on my account for daily transactions. Please let me know what further information you may need from my end to expedite the resolution process.\n\nThank you for your time and consideration.\n\nWarm regards,\n\nMagdalena Ernesto Muñiz"},{"content":"{\"fields_to_redact\":[{\"string\":\"September 14, 1990\",\"pii_type\":\"date\"},{\"string\":\"jonathan31@example.com\",\"pii_type\":\"email_address\"},{\"string\":\"support@examplebank.com\",\"pii_type\":\"domain_name\"},{\"string\":\"Magdalena Ernesto Muñiz\",\"pii_type\":\"person_name\"},{\"string\":\"746-39-3991\",\"pii_type\":\"personal_id\"},{\"string\":\"SCIH11262643243763\",\"pii_type\":\"banking_number\"},{\"string\":\"+33 2 49 68 93 16\",\"pii_type\":\"phone_number\"},{\"string\":\"September 10th\",\"pii_type\":\"date\"},{\"string\":\"Christian\",\"pii_type\":\"religious_affiliation\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a bank_statement. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"```\nBANK STATEMENT\n\nAccount Holder: Todd Hall\nStreet Address: Callejón Sur Barela 853 Edif. 901, Depto. 743\n Nueva Guatemala, TAMPS 97966-1870\nPhone Number: 293.243.3118x648\nEmail Address: taylorgavin@example.com\n\nBanking Number: JRIC1678435261005\n\nStatement Date: 2010-05-23\n\n================================================================\nAccount Summary\n----------------------------------------------------------------\nBeginning Balance: $8,572.45\n\nTransactions:\n2010-05-01 Grocery Store -$123.49\n2010-05-04 Online Shopping -$75.00\n2010-05-09 Utility Bill Payment -$98.76\n2010-05-12 Salary Credit +$2,500.00\n2010-05-15 Coffee Shop -$6.75\n2010-05-20 Dining Out -$57.80\n\nTotal Deposits and Credits: +$2,500.00\nTotal Withdrawals and Debits: -$361.80\n\nEnding Balance: $10,710.65\n\n================================================================\nImportant Notes:\nPlease review your transactions carefully and notify us of any discrepancies within 30 days. For inquiries, contact our customer service at 1-800-555-0123, available 24/7.\n\nRemember to keep your banking number JRIC1678435261005 confidential to ensure the security of your account.\n\nThank you for banking with us!\n\n```"},{"content":"{\"fields_to_redact\":[{\"string\":\"Todd Hall\",\"pii_type\":\"person_name\"},{\"string\":\"Callejón Sur Barela 853 Edif. 901, Depto. 743\\n Nueva Guatemala, TAMPS 97966-1870\",\"pii_type\":\"street_address\"},{\"string\":\"293.243.3118x648\",\"pii_type\":\"phone_number\"},{\"string\":\"taylorgavin@example.com\",\"pii_type\":\"email_address\"},{\"string\":\"JRIC1678435261005\",\"pii_type\":\"banking_number\"},{\"string\":\"2010-05-23\",\"pii_type\":\"date\"},{\"string\":\"2010-05-01\",\"pii_type\":\"date\"},{\"string\":\"2010-05-04\",\"pii_type\":\"date\"},{\"string\":\"2010-05-09\",\"pii_type\":\"date\"},{\"string\":\"2010-05-12\",\"pii_type\":\"date\"},{\"string\":\"2010-05-15\",\"pii_type\":\"date\"},{\"string\":\"2010-05-20\",\"pii_type\":\"date\"},{\"string\":\"1-800-555-0123\",\"pii_type\":\"phone_number\"},{\"string\":\"JRIC1678435261005\",\"pii_type\":\"banking_number\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Urgent Assistance Required for Account Access\n\nDate: 2013-07-19\n\nFrom: kayla11@example.net\n\nTo: support@hussain.com\n\nDear Hussain.com Support Team,\n\nI hope this message finds you well. My name is Melissa Vang, and I am reaching out to request immediate assistance with accessing my online account. I've been experiencing difficulties logging in over the last few days, and my attempts to reset my password have been unsuccessful.\n\nAdditionally, I have noticed some suspicious activity on my account statements and am concerned about potential unauthorized access. I believe this issue might be related to my recent banking transactions. For reference, my banking number is DVWI08936336986317.\n\nCould you please help me regain access and investigate any potential breaches? I am worried about the security of my sensitive financial information. Let me know if you need further details or identification verification to resolve this matter swiftly.\n\nThank you for your prompt attention to this urgent issue. Looking forward to your response.\n\nBest regards,\n\nMelissa Vang"},{"content":"{\"fields_to_redact\":[{\"string\":\"2013-07-19\",\"pii_type\":\"date\"},{\"string\":\"kayla11@example.net\",\"pii_type\":\"email_address\"},{\"string\":\"support@hussain.com\",\"pii_type\":\"email_address\"},{\"string\":\"Melissa Vang\",\"pii_type\":\"person_name\"},{\"string\":\"DVWI08936336986317\",\"pii_type\":\"banking_number\"},{\"string\":\"Melissa Vang\",\"pii_type\":\"person_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a utility_bill. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"```\nElectricity Provider: SparkPower Utilities\nBilling Date: 2004-05-30\nAccount Number: 8945-3320-5567\n\nBill To:\nDavid Horton\nUrbanización de Fito Jimenez 54\nPontevedra, 33266\nSpain\n\nPersonal ID: ZZ 25 80 78 T\n\nService Plan: Standard Residential\nMeter Number: 9876543210\n\nBilling Period: 2004-04-01 to 2004-04-30\nTotal Usage: 350 kWh\n\nItemized Charges:\n- Basic Service Fee: €25.00\n- Energy Charge: 350 kWh x €0.14 = €49.00\n- Renewable Energy Fee: €5.00\n- Local Tax: €3.25\nTotal Amount Due: €82.25\n\nPayment Due By: 2004-06-15\n\nQuestions or Concerns?\nContact our customer service at 1-800-555-ENERGY or visit www.sparkpower.es\n\nThank you for choosing SparkPower Utilities. Save energy, live better!\n```"},{"content":"{\"fields_to_redact\":[{\"string\":\"2004-05-30\",\"pii_type\":\"date\"},{\"string\":\"David Horton\",\"pii_type\":\"person_name\"},{\"string\":\"Urbanización de Fito Jimenez 54\\nPontevedra, 33266\\nSpain\",\"pii_type\":\"street_address\"},{\"string\":\"ZZ 25 80 78 T\",\"pii_type\":\"personal_id\"},{\"string\":\"2004-04-01\",\"pii_type\":\"date\"},{\"string\":\"2004-04-30\",\"pii_type\":\"date\"},{\"string\":\"www.sparkpower.es\",\"pii_type\":\"domain_name\"},{\"string\":\"2004-06-15\",\"pii_type\":\"date\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a bank_statement. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"```plaintext\nModern Bank Corporation \n1328 Financial Plaza \nCapital City, NY 10001\n\nStatement Date: January 18, 2004 \nAccount Holder: Noël de la Mace \nAccount Number: JSHP19224370051243\n\nBilling Address:\n1334 Sean Mountain Apt. 250 \nLake Joshuaberg, ND 25466\n\nContact Number: 383-692-1810\n\n--------------------------------------------------------------------\nTRANSACTION HISTORY\n--------------------------------------------------------------------\nDate Description Amount Balance\n--------------------------------------------------------------------\n01/01/2004 Direct Deposit - Salary +3,500.00 16,250.00 \n01/03/2004 Grocery Store Purchase - 145.37 16,104.63 \n01/05/2004 Online Subscription - 12.99 16,091.64 \n01/07/2004 Transfer to Savings (Acc. ****9245)- 1,000.00 15,091.64 \n01/10/2004 Utility Bill Payment - 208.56 14,883.08 \n01/12/2004 Coffee Shop Purchase - 6.57 14,876.51 \n01/15/2004 Insurance Premium - 210.00 14,666.51 \n01/17/2004 Movie Theater Payment - 30.00 14,636.51 \n\n--------------------------------------------------------------------\nDAILY INTEREST\nTotal Checks Paid This Period: $0.00\nTotal Deposits/Credits This Period: $3,500.00\nMinimum Balance This Period: $13,736.51\n\nINTEREST - ANNUAL PERCENTAGE YIELD EARNED: 0.01%\nAMOUNT EARNED THIS PERIOD: $0.35\n\nThank you for banking with us. For assistance, contact our customer service at (888)555-0199 or visit your local branch.\n\n*** END OF STATEMENT ***\n\n```\n"},{"content":"{\"fields_to_redact\":[{\"string\":\"January 18, 2004\",\"pii_type\":\"date\"},{\"string\":\"Noël de la Mace\",\"pii_type\":\"person_name\"},{\"string\":\"JSHP19224370051243\",\"pii_type\":\"banking_number\"},{\"string\":\"1334 Sean Mountain Apt. 250\",\"pii_type\":\"street_address\"},{\"string\":\"Lake Joshuaberg, ND 25466\",\"pii_type\":\"street_address\"},{\"string\":\"383-692-1810\",\"pii_type\":\"phone_number\"},{\"string\":\"01/01/2004\",\"pii_type\":\"date\"},{\"string\":\"01/03/2004\",\"pii_type\":\"date\"},{\"string\":\"01/05/2004\",\"pii_type\":\"date\"},{\"string\":\"01/07/2004\",\"pii_type\":\"date\"},{\"string\":\"01/10/2004\",\"pii_type\":\"date\"},{\"string\":\"01/12/2004\",\"pii_type\":\"date\"},{\"string\":\"01/15/2004\",\"pii_type\":\"date\"},{\"string\":\"01/17/2004\",\"pii_type\":\"date\"},{\"string\":\"(888)555-0199\",\"pii_type\":\"phone_number\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a company_memo. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"---\n\nTo: All Employees \nFrom: Marc King, Chief Operations Officer \nDate: May 18, 1995 \nSubject: Formation of Cross-Departmental Task Force & New Communication Protocol \n\nDear Ferreira Team,\n\nI am pleased to announce the establishment of a new cross-departmental task force within our organization, designed to enhance collaboration and innovation across the board. As many of you may know, our company's continuous success heavily relies on our ability to adapt and respond effectively to the ever-evolving market dynamics. With this initiative, we aim to harness the collective expertise of all our departments to drive ground-breaking projects and problem-solving strategies.\n\n**Purpose of the Task Force:**\nThe main objective of this task force will be to identify opportunities for strategic improvements, foster a collaborative work environment, and spearhead new projects that align with our company's mission and goals.\n\n**Members:**\nThe team will include representatives from all major departments including Marketing, Product Development, Customer Relations, and Human Resources. Each member was selected for their exceptional skills and experience in driving strategic initiatives.\n\n**First Meeting Details:**\n- Date: May 25, 1995\n- Time: 10:00 AM\n- Location: Conference Room B, Main Building\n\nPlease mark your calendars accordingly.\n\n**New Communication Protocol:**\nIn conjunction with the formation of the task force, we are implementing a streamlined communication protocol to ensure all initiatives and updates flow seamlessly across the organization. Effective June 1, 1995, we will be using a centralized intranet system for all internal communications related to task force activities. More details about accessing and navigating this system will be circulated shortly.\n\nFor any immediate concerns, or if you need further clarification regarding these initiatives, please do not hesitate to reach out to my office directly at 331-893-4575x2225. I am confident that with your support and enthusiasm, we will continue to elevate Ferreira to new heights of success.\n\nThank you for your commitment and dedication to our collective mission.\n\nWarm regards,\n\nMarc King \nChief Operations Officer \nFerreira \n\n---"},{"content":"{\"fields_to_redact\":[{\"string\":\"May 18, 1995\",\"pii_type\":\"date\"},{\"string\":\"May 25, 1995\",\"pii_type\":\"date\"},{\"string\":\"June 1, 1995\",\"pii_type\":\"date\"},{\"string\":\"331-893-4575x2225\",\"pii_type\":\"phone_number\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Urgent Assistance Required - Account Access Issue\n\nHi Dunn-Davis Support Team,\n\nI hope this message finds you well. My name is Cassie Garcia, and I am reaching out to you with an urgent request concerning my account access with your esteemed organization. I have been a member since joining at the age of 73, and while I truly appreciate the services Dunn-Davis provides, I am currently encountering some issues.\n\nHere’s what happened: On January 16, 2002, I attempted to log into my account using my standard credentials, but I faced unexpected access denial. For reference, my email address linked to the account is dyeremily@example.com, and my personal ID associated with your system is ZZ 046932 T. Moreover, I use the secure credential Sn3g8Uee$8 which has always worked perfectly until now.\n\nI also tried to reach out via phone, but I haven't been successful in connecting with a representative. My contact number is 6538182868, should you need to call back for more information.\n\nI kindly urge your immediate attention to this matter as I require access to important documents held within my account. Please let me know whatever additional information you might need from me to expedite the resolution process. I appreciate your prompt response and assistance.\n\nThank you in advance for your attention to this issue.\n\nWarm regards,\n\nCassie Garcia\nDunn-Davis Member since 2002"},{"content":"{\"fields_to_redact\":[{\"string\":\"Cassie Garcia\",\"pii_type\":\"person_name\"},{\"string\":\"73\",\"pii_type\":\"age\"},{\"string\":\"January 16, 2002\",\"pii_type\":\"date\"},{\"string\":\"dyeremily@example.com\",\"pii_type\":\"email_address\"},{\"string\":\"ZZ 046932 T\",\"pii_type\":\"personal_id\"},{\"string\":\"Sn3g8Uee$8\",\"pii_type\":\"secure_credential\"},{\"string\":\"6538182868\",\"pii_type\":\"phone_number\"},{\"string\":\"Cassie Garcia\",\"pii_type\":\"person_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a utility_bill. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"```\nPEREGRINE ENERGY SOLUTIONS\nCustomer Account Statement\n\n\nAccount Holder: Lauren Whitaker\nAccount Number: 0981635921\n\nBilling Period: 2023-08-01 to 2023-08-31\nIssue Date: 1974-08-20\nDue Date: 2023-09-15\n\nService Address:\n552 Eric Summit\nMortonport, PR 74503\n\nSummary of Charges:\n--------------------------------\nElectricity Supply Service:\nUsage: 450 kWh\nRate: $0.12 per kWh\nCharge: $54.00\n\nDelivery and Service Charges:\nBasic Service Charge: $8.50\nTransmission Charge: $5.25\nDistribution Charge: $7.20\n\nAdjustments:\nRenewable Energy Credit: -$3.00\n\nMiscellaneous:\nState Energy Program Fee: $1.75\nSales Tax (5.4%): $3.82\n--------------------------------\nTotal Amount Due: $77.52 USD\n\nThank you for choosing Peregrine Energy Solutions!\n \nPayment Options:\n- Online: www.peregrineenergy.com/paybill\n- Phone: Call 1-800-555-ENERGY (Mon-Fri, 8 AM to 8 PM)\n- Mail: Use the enclosed envelope to send a check or money order to:\n Peregrine Energy Solutions\n P.O. Box 21567\n Mortonport, PR 74501-1567\n\nFor questions regarding your bill or service, please contact our customer service department at the phone number provided above.\n\nEnergy savings tip of the month:\n\"Maximize your home's efficiency by switching to LED lighting, which uses about 75% less energy than traditional incandescent bulbs.\"\n```"},{"content":"{\"fields_to_redact\":[{\"string\":\"Lauren Whitaker\",\"pii_type\":\"person_name\"},{\"string\":\"0981635921\",\"pii_type\":\"personal_id\"},{\"string\":\"1974-08-20\",\"pii_type\":\"date\"},{\"string\":\"552 Eric Summit\\nMortonport, PR 74503\",\"pii_type\":\"street_address\"},{\"string\":\"1-800-555-ENERGY\",\"pii_type\":\"phone_number\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a company_memo. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"---\n\n**Tucker LLC**\n**Internal Memo**\n\n**To:** All Staff \n**From:** Lauren Wise, HR Manager \n**Date:** October 11, 1972 \n**Subject:** Team Bonding Retreat and Policy Updates\n\n---\n\nDear Team,\n\nI hope this message finds you well and thriving in your respective roles. As we approach the end of the quarter, I would like to extend my heartfelt appreciation for the tremendous work each one of you has contributed to the ongoing success of Tucker LLC. Our fiscal year has been challenging, yet your unwavering dedication has led to significant milestones in our journey.\n\n**Upcoming Retreat:**\n\nIn light of our achievements and to foster stronger relationships across departments, we are organizing a **Team Bonding Retreat**. \n\n**Date:** November 18, 1972 \n**Location:** Pine Crest Lodge, Blue Ridge Mountains \n\nThis all-expenses-paid retreat will include workshops, recreational activities, and a gala dinner. More details will follow soon. Please ensure your availability as it will be an excellent opportunity for personal and professional growth.\n\n**Policy Updates:**\n\nAs part of our ongoing efforts to streamline operations, please be advised of the following policy updates effective immediately:\n\n1. **Remote Work Amendment**: Given the evolving landscape of work flexibility, we are pleased to offer an optional remote work arrangement for up to two days a week.\n\n2. **Dress Code Revision**: Professional attire is essential, promoting our ethos of excellence; however, we recognize the need for comfort. Consequently, business casual attire is now deemed suitable from Monday to Thursday, with casual Fridays continuing as usual.\n\n3. **Health and Wellness Initiatives**: In alignment with our commitment to employee well-being, we are introducing a monthly wellness program offering fitness classes and mental health seminars. Participation is strongly encouraged.\n\nYour feedback and continuous improvement within the company are invaluable. Do not hesitate to reach out with any questions or suggestions regarding these updates. Let's keep the communication channels open.\n\nThank you once again for your hard work and dedication. Together, we will make Tucker LLC not only a leader in the industry but also a congenial workplace.\n\nWarm regards,\n\nLauren Wise \nHR Manager \nTucker LLC "},{"content":"{\"fields_to_redact\":[{\"string\":\"October 11, 1972\",\"pii_type\":\"date\"},{\"string\":\"November 18, 1972\",\"pii_type\":\"date\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a company_memo. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"To: All Staff \nFrom: Christine Boulay \nDate: June 21, 2009 \nSubject: Upcoming Changes in Leadership \n\nDear Team,\n\nI hope this memo finds you well. I am writing to inform you of some upcoming leadership changes within our organization, Mcbride-Watson, that will be effective immediately.\n\nAs part of our continuous effort to enhance our operations and strategic direction, we have appointed a new Vice President of Operations. This change aims to support our growing business needs and strengths, empowering our team to achieve new heights.\n\nI would like to take this opportunity to thank all of you for your unwavering dedication and hard work. With our collective efforts and the fresh perspective brought in by our new leadership, I am confident that Mcbride-Watson will continue to flourish and achieve its goals.\n\nIn other related news, I am delighted to announce that Mcbride-Watson recently secured a partnership with GreenTech Innovations—a milestone that promises to enhance our sustainability efforts. This collaboration underscores our commitment to innovation and environmentally friendly practices.\n\nPlease join me in welcoming our new leadership and embracing the exciting opportunities ahead.\n\nThank you for your continued support and enthusiasm.\n\nWarm regards,\n\nChristine Boulay \nDirector of Human Resources \nMcbride-Watson"},{"content":"{\"fields_to_redact\":[{\"string\":\"June 21, 2009\",\"pii_type\":\"date\"},{\"string\":\"Mcbride-Watson\",\"pii_type\":\"organization_name\"},{\"string\":\"Christine Boulay\",\"pii_type\":\"person_name\"},{\"string\":\"Mcbride-Watson\",\"pii_type\":\"organization_name\"},{\"string\":\"Mcbride-Watson\",\"pii_type\":\"organization_name\"},{\"string\":\"GreenTech Innovations\",\"pii_type\":\"organization_name\"},{\"string\":\"Christine Boulay\",\"pii_type\":\"person_name\"},{\"string\":\"Mcbride-Watson\",\"pii_type\":\"organization_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Urgent Assistance Required: Account Issue\n\nDear Support Team,\n\nI hope this message finds you well. My name is Stanley Jones, and I am reaching out to resolve an urgent issue I am currently experiencing with my account. \n\nAs a matter of introduction, allow me to provide some pertinent details:\n- Nationality: Francia\n- Full Name: Stanley Jones\n- Email Address: velezolga@example.net\n- Banking Number: QHWI20068244467577\n- Street Address: PSC 1754, Box 2882, APO AE 23242\n\nI have encountered a problem accessing my online banking portal, which seems to stem from an error message indicating a possible breach or unauthorized activity. This has caused great concern as I need to manage my financial transactions and keep track of my expenses efficiently.\n\nI kindly request your immediate assistance in resolving this matter. Any confirmation on successful troubleshooting or next steps would be greatly appreciated. Please let me know if you require any further information from my end to expedite this process.\n\nThank you for your prompt attention to this urgent issue. I look forward to your swift response.\n\nWarm regards,\n\nStanley Jones"},{"content":"{\"fields_to_redact\":[{\"string\":\"Stanley Jones\",\"pii_type\":\"person_name\"},{\"string\":\"Francia\",\"pii_type\":\"nationality\"},{\"string\":\"Stanley Jones\",\"pii_type\":\"person_name\"},{\"string\":\"velezolga@example.net\",\"pii_type\":\"email_address\"},{\"string\":\"QHWI20068244467577\",\"pii_type\":\"banking_number\"},{\"string\":\"PSC 1754, Box 2882, APO AE 23242\",\"pii_type\":\"street_address\"},{\"string\":\"Stanley Jones\",\"pii_type\":\"person_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a personal_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Catching Up Soon?\n\nHi Regina,\n\nI hope this email finds you well! It's been far too long since we last chatted. I've been meaning to catch up and see how everything is going on your end. Life in the city has been hectic, but things are finally starting to calm down. \n\nHow’s work treating you these days? Hope you’re not swamped with projects. Also, do you still have plans to visit the new art exhibition downtown? I’ve heard it’s amazing and thought it might be a good idea for us to check it out together. Let me know what you think!\n\nBy the way, I had some issues with my emails recently. If you’re ever in doubt, just reach out directly to me at my other email: xphillips@example.net. Always happy to hear from you!\n\nTake care and talk soon.\n\nBest,\nAlex"},{"content":"{\"fields_to_redact\":[{\"string\":\"xphillips@example.net\",\"pii_type\":\"email_address\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Assistance Required with a Recent Software Upgrade\n\nDate: March 10, 1989\n\nDear Support Team,\n\nMy name is Sierra Keller, and I am reaching out to seek help with a technical issue I've encountered after a recent software upgrade on my workstation. I understand from other users in my demographic group, White, that this can be somewhat common, but I am hoping for a swift resolution.\n\nHere's the situation: after updating the system yesterday, Monday, the software just isn't functioning as smoothly as before. The graphics seem pixelated and the application occasionally crashes when I try to perform multi-tasking operations. Considering my date of birth is April 24, 1985, technological transitions like this can be a bit daunting, so your guidance would be greatly appreciated.\n\nI have already attempted rebooting the system and reinstalling the software, but the issues persist. It would be helpful to know if I need to adjust any specific settings or perhaps reinstall any additional components.\n\nPlease contact me at loriwilliams@example.com at your earliest convenience. I am confident your expertise will help resolve this matter promptly.\n\nThank you for your attention to this issue.\n\nBest regards,\n\nSierra Keller"},{"content":"{\"fields_to_redact\":[{\"string\":\"March 10, 1989\",\"pii_type\":\"date\"},{\"string\":\"Sierra Keller\",\"pii_type\":\"person_name\"},{\"string\":\"White\",\"pii_type\":\"demographic_group\"},{\"string\":\"April 24, 1985\",\"pii_type\":\"date_of_birth\"},{\"string\":\"loriwilliams@example.com\",\"pii_type\":\"email_address\"},{\"string\":\"Sierra Keller\",\"pii_type\":\"person_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Urgent Assistance Required for Account Verification\n\nHi Support Team,\n\nI hope this message finds you well. My name is Mark Sutton, and I am reaching out to seek your help with an urgent issue regarding my recent transactions on your platform.\n\nI am having trouble accessing my account on foster.biz. Despite several attempts, I’ve been unable to verify my banking number. I suspect it might be due to an error in the personal details linked to my account. For verification, here are my details:\n\n- **Name:** Mark Sutton\n- **Age:** 83\n- **Email:** pcontreras@example.com\n- **Date of Birth:** 2000-06-26\n\nThe banking number associated with my account is ZFNY4117402662110. Please let me know if any further information is required to resolve this issue. \n\nYour prompt assistance in helping me restore access to my account is highly appreciated.\n\nThank you for your immediate attention to this matter.\n\nWarm regards,\n\nMark Sutton"},{"content":"{\"fields_to_redact\":[{\"string\":\"Mark Sutton\",\"pii_type\":\"person_name\"},{\"string\":\"foster.biz\",\"pii_type\":\"domain_name\"},{\"string\":\"83\",\"pii_type\":\"age\"},{\"string\":\"pcontreras@example.com\",\"pii_type\":\"email_address\"},{\"string\":\"2000-06-26\",\"pii_type\":\"date_of_birth\"},{\"string\":\"ZFNY4117402662110\",\"pii_type\":\"banking_number\"},{\"string\":\"Mark Sutton\",\"pii_type\":\"person_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a medical_record. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"**Patient Medical Record**\n\n---\n\n**Patient Information:**\n\n- **Name:** George Riley \n- **Date of Birth:** July 2, 2004 \n- **Age:** 18 \n- **Gender:** Female \n- **Personal ID:** 191-85-8091 \n- **Contact Number:** 1-859-850-7840x8870 \n\n---\n\n**Medical History:**\n\n- **Current Medical Condition:** \n - **Primary Diagnosis:** Arrhythmia \n - **Description:** The patient exhibits abnormal heart rhythms which may be sporadic, consistently fast, or irregular. This condition requires regular monitoring and potentially investigatory procedures to determine the underlying cause and appropriate treatment plan.\n\n- **Previous Medical Concerns:** \n - None reported at this time.\n\n- **Family Medical History:** \n No immediate family history of cardiovascular disorders has been recorded, although maternal grandparents experienced hypertension and late-onset Type II diabetes.\n\n---\n\n**Current Medications:**\n\n1. **Metoprolol:** 25mg, taken twice daily to manage heart rate and blood pressure, particularly considering the Arrhythmia.\n2. **Aspirin:** 81mg, daily to reduce the risk of blood clots.\n\n---\n\n**Allergies:**\n\n- Penicillin: Causes mild skin rash and itching.\n- No known food or environmental allergies reported.\n\n---\n\n**Lifestyle and Habits:**\n\n- **Dietary Habits:** Balanced diet with controlled sodium intake.\n- **Exercise Routine:** Participates in calm, regular walking and yoga sessions to manage stress and improve cardiovascular health.\n- **Smoking Status:** Non-smoker.\n- **Alcohol Consumption:** Occasional consumption; advised moderation.\n\n---\n\n**Recent Consultations and Tests:**\n\n- **Last Reviewed:** September 15, 2023\n- **Previous ECG Results:** Indicated sporadic irregular heartbeats; follow-up consultation recommended.\n- **Upcoming Tests:** Scheduled for a Holter Monitor assessment in November 2023 to continuously monitor heart activity for 24-48 hours.\n\n---\n\n**Doctor's Notes:**\n\n- Patient exhibits a conscientious attitude towards health management and adheres well to prescribed treatments. Needs to maintain regular appointments for condition assessment.\n- Psychologically coping well with the diagnosis, demonstrates an optimistic outlook and engages actively in prescribed lifestyle modifications.\n\n*End of Record*"},{"content":"{\"fields_to_redact\":[{\"string\":\"George Riley\",\"pii_type\":\"person_name\"},{\"string\":\"July 2, 2004\",\"pii_type\":\"date_of_birth\"},{\"string\":\"18\",\"pii_type\":\"age\"},{\"string\":\"Female\",\"pii_type\":\"gender\"},{\"string\":\"191-85-8091\",\"pii_type\":\"personal_id\"},{\"string\":\"1-859-850-7840x8870\",\"pii_type\":\"phone_number\"},{\"string\":\"Arrhythmia\",\"pii_type\":\"medical_condition\"},{\"string\":\"September 15, 2023\",\"pii_type\":\"date\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a personal_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Adventure Awaits! 🌟\n\nHi Lisa,\n\nI hope this email finds you well! It's been way too long since our last catch-up, hasn't it? I've been thinking about that hiking adventure we talked about and I believe it's about time we make it happen. Fall is such a beautiful season for the trails, and I can already picture the vibrant leaves creating a stunning tapestry around us.\n\nLet me know if you're still up for exploring the Appalachian Trail. I’ve been doing some research and found this cozy cabin we could rent right near the park entrance. Maybe we can go in the last week of October when the foliage is at its peak? It's the perfect escape before we get wrapped up in holiday madness.\n\nAlso, before I forget, I stumbled upon an incredible outdoor gear shop during my last business trip. They have everything from heavy-duty hiking boots to compact camping gadgets. I think you’d love it. If you're interested, we could pop over there next weekend to check out their stuff and grab a hot cocoa or two. Perfect prep for an adventure, right? 😊\n\nAnyway, drop me a line at your convenience, lisa41@example.com, and let’s solidify some plans. The great outdoors and trails are calling our name, can't wait to share this experience with you!\n\nTake care, and talk soon!\n\nBest,\nDonald\n\nP.S. Also, if you come up with any other brilliant ideas for our getaway or any hidden gem spots, I'm all ears! Plus, don't forget your camera—it’ll be a photo op galore! 📸"},{"content":"{\"fields_to_redact\":[{\"string\":\"lisa41@example.com\",\"pii_type\":\"email_address\"},{\"string\":\"Donald\",\"pii_type\":\"person_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a personal_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: It's Been A While 😊\n\nHi there Jennifer Moore,\n\nI hope this email finds you well! It's been ages since we last caught up. How is everything going in your corner of the world?\n\nThe other day, I was going through some old photos and stumbled across that hilarious one from our trip to the coast. Remember the fish market incident? 😂 Good times! Anyway, I thought I'd drop you a line and say hello.\n\nBy the way, I came across an article on sustainable living that might interest you. If you're still keen on transforming that garden of yours at Pasaje Felicia Barroso 179 Puerta 0, Vizcaya, 48981 into a more eco-friendly space, let me know, and I can send it your way!\n\nAlso, if you need to reach me for any gardening tips or just to chat, drop me an email at qhardy@example.org. I'd love to hear all about the new projects you're working on.\n\nLooking forward to catching up soon. Take care and talk later!\n\nWarm regards,\nQuinn"},{"content":"{\"fields_to_redact\":[{\"string\":\"Jennifer Moore\",\"pii_type\":\"person_name\"},{\"string\":\"Pasaje Felicia Barroso 179 Puerta 0, Vizcaya, 48981\",\"pii_type\":\"street_address\"},{\"string\":\"qhardy@example.org\",\"pii_type\":\"email_address\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Urgent Technical Support Required\n\nDate: October 31, 1998\n\nFrom: Lynda Bailey \nTo: support@suministros.es\n\nDear Chapman-Leach Support Team,\n\nI hope this message finds you well. My name is Lynda Bailey, and I have been experiencing some technical difficulties with your service over the past few days. I would highly appreciate your immediate attention to resolve the issue at hand.\n\nAs a representative of our organization, Chapman-Leach, I rely heavily on uninterrupted service for our daily operations. Specifically, our team has encountered persistent access issues when attempting to use your online platform, which is crucial for our supply chain management.\n\nHere are the specific details of the problem:\n\n- **Account Holder:** Lynda Bailey\n- **Account ID:** (Please check internally)\n- **Contact Number:** 279-121-8623 x586\n- **Main Concerns:** Inability to access supplier database since October 29, 1998, multiple erroneous login attempts recorded despite correct credentials.\n\nAdditionally, while trying to troubleshoot the problem, I've come across a notice that our domain name, suministros.es, might be part of a scheduled maintenance window. However, there was no prior notification sent to my email address (melissalewis@example.org), and thus, we were unable to make any necessary preparations.\n\nFor my verification purposes, you may refer to my date of birth: October 5, 1970. I am aware of the importance of keeping sensitive information secure, hence all communications should be directed to my official email address.\n\nPlease prioritize this request as our operations are currently on hold. Awaiting your prompt response with a solution or an update on when this issue is expected to be resolved. Feel free to reach out to me at my contact number between 9 am - 6 pm for any clarifications needed.\n\nThank you for your immediate attention to this matter.\n\nBest regards,\n\nLynda Bailey\nChapman-Leach Operations \nEmail: melissalewis@example.org \nPhone: 279-121-8623 x586 "},{"content":"{\"fields_to_redact\":[{\"string\":\"October 31, 1998\",\"pii_type\":\"date\"},{\"string\":\"Lynda Bailey\",\"pii_type\":\"person_name\"},{\"string\":\"melissalewis@example.org\",\"pii_type\":\"email_address\"},{\"string\":\"Chapman-Leach\",\"pii_type\":\"organization_name\"},{\"string\":\"Lynda Bailey\",\"pii_type\":\"person_name\"},{\"string\":\"279-121-8623 x586\",\"pii_type\":\"phone_number\"},{\"string\":\"October 29, 1998\",\"pii_type\":\"date\"},{\"string\":\"suministros.es\",\"pii_type\":\"domain_name\"},{\"string\":\"melissalewis@example.org\",\"pii_type\":\"email_address\"},{\"string\":\"October 5, 1970\",\"pii_type\":\"date_of_birth\"},{\"string\":\"melissalewis@example.org\",\"pii_type\":\"email_address\"},{\"string\":\"Lynda Bailey\",\"pii_type\":\"person_name\"},{\"string\":\"Chapman-Leach\",\"pii_type\":\"organization_name\"},{\"string\":\"melissalewis@example.org\",\"pii_type\":\"email_address\"},{\"string\":\"279-121-8623 x586\",\"pii_type\":\"phone_number\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a rental_agreement. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"**RESIDENTIAL RENTAL AGREEMENT**\n\nThis Rental Agreement (\"Agreement\") is made and entered into this 6th day of March, 2004, by and between Sandra Davis (\"Landlord\") and Michelle Thompson (\"Tenant\").\n\n**1. Premises**\n\nLandlord rents to Tenant, and Tenant rents from Landlord, the residential property located at 8068 Anne Haven Apt. 109, Murrayland, WY 36122 (\"Premises\").\n\n**2. Term**\n\nThe term of this lease shall commence on March 6, 2004, and shall be on a month-to-month basis. Either party may terminate this Agreement by giving the other thirty (30) days' written notice prior to the termination date.\n\n**3. Rent**\n\nTenant agrees to pay the monthly rent of $950.00 due on or before the first day of each month during the term of this Agreement.\n\n**4. Security Deposit**\n\nA security deposit of $950.00 is required from Tenant, payable to Landlord upon execution of this Agreement, as security for any damages caused to the Premises during the term of this Agreement.\n\n**5. Utilities**\n\nTenant shall be responsible for all utilities including water, gas, electricity, and trash collection. Landlord is responsible for ensuring storm windows are installed and functional.\n\n**6. Maintenance and Repairs**\n\nTenant is responsible for maintaining the Premises in a neat and orderly manner and shall inform Landlord of any damages or needed repairs. Landlord will address necessary repairs in a timely manner.\n\n**7. Pet Policy**\n\nTenant may keep one pet, a cat or dog, with a pet deposit of $200.00, covering potential damage or additional cleaning costs.\n\n**8. Contact Information**\n\nTenant acknowledges that exchange of notices and communications can be facilitated via phone or in writing. Tenant’s phone number: 1-941-205-9644x35298.\n\n**IN WITNESS WHEREOF**, the parties have executed this Rental Agreement as of the date first above written.\n\n____________________________ \nSandra Davis, Landlord\n\n____________________________ \nMichelle Thompson, Tenant"},{"content":"{\"fields_to_redact\":[{\"string\":\"March 6, 2004\",\"pii_type\":\"date\"},{\"string\":\"8068 Anne Haven Apt. 109, Murrayland, WY 36122\",\"pii_type\":\"street_address\"},{\"string\":\"March 6, 2004\",\"pii_type\":\"date\"},{\"string\":\"1-941-205-9644x35298\",\"pii_type\":\"phone_number\"},{\"string\":\"Sandra Davis\",\"pii_type\":\"person_name\"},{\"string\":\"Michelle Thompson\",\"pii_type\":\"person_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a rental_agreement. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"\n\n\nRESIDENTIAL RENTAL AGREEMENT\n\nTHIS LEASE AGREEMENT (the \"Agreement\") made and entered into this 8th day of June, 1993, by and between Johnathan Properties, LLC, a Limited Liability Company organized and existing under the laws of the State of Iowa with its principal office located at 12 Birch Lane, Greerburgh, IA 19183 (hereinafter referred to as \"Landlord\") and April Chambers (hereinafter referred to as \"Tenant\").\n\n1. PREMISES: Landlord hereby leases to Tenant a single-family apartment located at 599 Edward Fields Suite 154, Greerburgh, IA 19183 (the \"Premises\").\n\n2. TERM: The term of this Lease shall commence on 8th June 1993 and shall continue for a period of twelve (12) months thereafter, unless terminated earlier in accordance with this Agreement.\n\n3. RENT: Tenant agrees to pay Landlord as rent for the Premises the sum of Nine Hundred and Fifty Dollars ($950.00) per month, on or before the 1st day of each month, beginning with the first full month of the term hereof.\n\n4. SECURITY DEPOSIT: A security deposit of One Thousand Dollars ($1000.00) is to be paid to the Landlord prior to move-in, to cover potential damages or any unpaid amounts due under the terms of this Agreement.\n\n5. UTILITIES: Tenant shall be responsible for all utilities as it relates to the Premises including, but not limited to, electricity, gas, trash collection, and water. \n\n6. USE OF PREMISES: Tenant will use the Premises for residential purposes only and shall comply with all laws, ordinances, and regulations concerning the use and condition of the Premises.\n\n7. MAINTENANCE AND REPAIRS: Tenant shall maintain the Premises in good, clean, and habitable condition. Any major repairs required must be communicated by the Tenant to the Landlord immediately.\n\n8. PET POLICY: No pets shall be kept on the Premises without prior written consent of the Landlord.\n\n9. CONTACT INFORMATION: Tenant shall provide Landlord with a telephone number for contact, which shall remain +1-356-233-7465 throughout the duration of the tenancy.\n\n10. NOTICE REQUIREMENTS: Any notice required to be given under this lease shall be given in writing and delivered either personally or by certified mail to the recipient. Notices to Tenant shall be addressed to 599 Edward Fields Suite 154, Greerburgh, IA 19183. Notices to Landlord shall be addressed to: Johnathan Properties, LLC, 12 Birch Lane, Greerburgh, IA 19183.\n\nIN WITNESS WHEREOF, the parties hereto have executed this Lease Agreement as of the day and year last written below.\n\nLandlord Signature: _______________________ Date: ___________\n\nTenant Signature: _________________ April Chambers Date: 1993-06-08"},{"content":"{\"fields_to_redact\":[{\"string\":\"June, 1993\",\"pii_type\":\"date\"},{\"string\":\"April Chambers\",\"pii_type\":\"person_name\"},{\"string\":\"12 Birch Lane, Greerburgh, IA 19183\",\"pii_type\":\"street_address\"},{\"string\":\"599 Edward Fields Suite 154, Greerburgh, IA 19183\",\"pii_type\":\"street_address\"},{\"string\":\"8th June 1993\",\"pii_type\":\"date\"},{\"string\":\"+1-356-233-7465\",\"pii_type\":\"phone_number\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a bank_statement. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"```\nBANK OF DYLANBURGH\nOfficial Bank Statement\nStatement Date: 1980-08-23\n\nAccount Holder: Élisabeth Hubert\nMailing Address: \n8127 Thomas Shores\nDylanburgh, NT X9M9E1\n\nAccount Number: INAW79333072440824\nEmail Contact: dawn79@example.net\n\nTRANSACTION DETAILS:\n\nDATE DESCRIPTION AMOUNT BALANCE\n------------------------------------------------------------------------\n1980-08-01 Direct Deposit +$2,500.00 $2,500.00\n1980-08-05 Grocery Mart -$150.75 $2,349.25\n1980-08-10 Dylanburgh Electricity Co. -$60.50 $2,288.75\n1980-08-15 Coffee Delight Café -$12.30 $2,276.45\n1980-08-18 Online Shopping - Bookstore -$35.90 $2,240.55\n1980-08-20 Monthly Rent Payment -$800.00 $1,440.55\n\nThank you for banking with us, Élisabeth! If you have any queries regarding your transactions, please contact us at customer.service@bankofdylanburgh.com or call 1-800-555-1234.\n\nReminders:\n- Keep your account in good standing by monitoring your transactions regularly.\n- Our customer service is available 24/7 for any assistance.\n- Update your personal details to receive timely notifications.\n\n------------------------------------------------------------------------\n\nTHIS DOCUMENT IS CONFIDENTIAL AND INTENDED FOR THE ACCOUNT HOLDER ONLY. PLEASE MAINTAIN THIS DOCUMENT SECURELY.\n```"},{"content":"{\"fields_to_redact\":[{\"string\":\"1980-08-23\",\"pii_type\":\"date\"},{\"string\":\"Élisabeth Hubert\",\"pii_type\":\"person_name\"},{\"string\":\"8127 Thomas Shores\\nDylanburgh, NT X9M9E1\",\"pii_type\":\"street_address\"},{\"string\":\"INAW79333072440824\",\"pii_type\":\"banking_number\"},{\"string\":\"dawn79@example.net\",\"pii_type\":\"email_address\"},{\"string\":\"1980-08-01\",\"pii_type\":\"date\"},{\"string\":\"1980-08-05\",\"pii_type\":\"date\"},{\"string\":\"1980-08-10\",\"pii_type\":\"date\"},{\"string\":\"1980-08-15\",\"pii_type\":\"date\"},{\"string\":\"1980-08-18\",\"pii_type\":\"date\"},{\"string\":\"1980-08-20\",\"pii_type\":\"date\"},{\"string\":\"Élisabeth\",\"pii_type\":\"person_name\"},{\"string\":\"customer.service@bankofdylanburgh.com\",\"pii_type\":\"email_address\"},{\"string\":\"1-800-555-1234\",\"pii_type\":\"phone_number\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a rental_agreement. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"RENTAL AGREEMENT\n\nThis Rental Agreement (\"Agreement\") is made and entered into on this 28th day of February, 2024, by and between:\n\nLandlord: Olympus Property Holdings LLC (“Landlord”), whose mailing address is 500 Mount Olympus Boulevard, San Dulce María los altos, CAMP 56482.\n\nAND\n\nTenant: Thomas Williams\n\nFor the property located at:\nBoulevard Rwanda 919 631\nSan Dulce María los altos, CAMP 56482\n\nContact Information for Tenant:\nPhone Number: +1-267-940-9831x604\n\n1. TERM: \nThe term of this lease shall commence on March 1, 2024, and shall continue through February 29, 2025, unless terminated sooner as provided herein.\n\n2. RENT:\nTenant agrees to pay a monthly rent of $1,200.00 (One Thousand Two Hundred Dollars), due on the first day of each month. All payments shall be made payable to Olympus Property Holdings LLC and sent to the address mentioned above.\n\n3. SECURITY DEPOSIT:\nA security deposit of $1,200.00 (One Thousand Two Hundred Dollars) is required at the beginning of the lease term and will be held in trust by the Landlord as collateral for damages beyond normal wear and tear.\n\n4. UTILITIES:\nTenant is responsible for all utilities, including electricity, water, gas, and internet services, associated with the property during the lease term.\n\n5. USE OF PREMISES:\nThe premises shall be used as a residential dwelling and shall not be used for any unlawful purpose.\n\n6. MAINTENANCE AND REPAIRS:\nTenant shall keep and maintain the premises in good condition and repair, reporting any damage or maintenance needs to the Landlord promptly.\n\n7. PET POLICY:\nPets are permitted on the premises with prior consent from Landlord and are subject to an additional pet deposit of $500.00 (Five Hundred Dollars).\n\n8. TERMINATION:\nEither party may terminate this Agreement by giving a 30-day written notice. Early lease termination by Tenant may result in a penalty fee equivalent to one month’s rent.\n\n9. GOVERNING LAW:\nThis Agreement shall be governed by the laws of the state of CAMP.\n\nIN WITNESS WHEREOF, Landlord and Tenant have caused this Agreement to be executed as of the day and year first above written.\n\n_________________________ _________________________\nOlympus Property Holdings LLC Thomas Williams\n\nDate: _________________________ Date: 2024-02-28\n\nWitness: ________________________"},{"content":"{\"fields_to_redact\":[{\"string\":\"February, 2024\",\"pii_type\":\"date\"},{\"string\":\"500 Mount Olympus Boulevard, San Dulce María los altos, CAMP 56482\",\"pii_type\":\"street_address\"},{\"string\":\"Rwanda\",\"pii_type\":\"nationality\"},{\"string\":\"+1-267-940-9831x604\",\"pii_type\":\"phone_number\"},{\"string\":\"March 1, 2024\",\"pii_type\":\"date\"},{\"string\":\"February 29, 2025\",\"pii_type\":\"date\"},{\"string\":\"Olympus Property Holdings LLC\",\"pii_type\":\"organization_name\"},{\"string\":\"Thomas Williams\",\"pii_type\":\"person_name\"},{\"string\":\"2024-02-28\",\"pii_type\":\"date\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a company_memo. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"---\n\n**FROM:** Melissa Parks \n**TO:** All Staff \n**DATE:** March 9, 1985 \n**SUBJECT:** Partnership with Jones, Rollins and Joseph\n\n---\n\nDear Team,\n\nI am excited to announce that our organization is embarking on a transformative journey by partnering with the esteemed law firm Jones, Rollins and Joseph. This collaboration is set to commence on the 15th of March, 1985, marking a significant milestone in our growth trajectory.\n\nFounded on the principles of integrity, expertise, and client satisfaction, Jones, Rollins and Joseph has been a pillar in the legal industry for over five decades. Their dedication to delivering unparalleled service aligns perfectly with our own ambitious vision of excellence and innovation.\n\nThroughout this partnership, we will gain access to a wealth of legal expertise and resources that will enable us to navigate complex industry challenges with confidence. It is our shared belief that this collaboration will facilitate new opportunities, streamline our operations, and ultimately enhance our reputation as leaders in our respective fields.\n\nI encourage all departments to lean into this opportunity to work closely with our new partners. Let's ensure that our practices are fully aligned as we move forward together. Further details on the integration process will be provided during our upcoming town hall meeting on March 12, 1985.\n\nIn preparation for this exciting new chapter, I would like to extend my gratitude to each of you for your continued hard work and commitment. Your efforts have been integral to making this collaboration a reality.\n\nShould you have any questions or require additional information, please feel free to reach out to my office directly.\n\nWarm regards,\n\nMelissa Parks \nChief Executive Officer \n[Your Organization Name] \n\nThis memorandum is confidential and intended solely for the use of the individuals or entity to whom it is addressed. Unauthorized disclosure, reproduction, or distribution is prohibited without express permission from [Your Organization Name]."},{"content":"{\"fields_to_redact\":[{\"string\":\"March 9, 1985\",\"pii_type\":\"date\"},{\"string\":\"Jones, Rollins and Joseph\",\"pii_type\":\"organization_name\"},{\"string\":\"15th of March, 1985\",\"pii_type\":\"date\"},{\"string\":\"Jones, Rollins and Joseph\",\"pii_type\":\"organization_name\"},{\"string\":\"March 12, 1985\",\"pii_type\":\"date\"},{\"string\":\"Melissa Parks\",\"pii_type\":\"person_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a rental_agreement. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"### RESIDENTIAL TENANCY AGREEMENT\n\nThis Residential Tenancy Agreement is made and entered into on the 19th day of July, 2002, by and between:\n\n**Landlord:**\nMademoiselle Antoinette Lefevre \nAddress: 27, Allée du Chateau \n95432 Montrouge \nEmail: antoinette.lefevre@landlords.net \nPhone: 001-682-319-7721\n\n**Tenant:**\nMonsieur Matthieu du Marion \nStreet Address: 15, rue de Giraud, 95543 Gros \nPhone Number: 001-546-208-5668 \nEmail Address: arthur21@example.net\n\n### RENTAL PROPERTY\n\nAddress of Rental Property: **15, rue de Giraud, 95543 Gros**\n\n### TERM OF TENANCY\n\nCommencement Date of Tenancy: **19th July 2002** \nEnd Date of Tenancy: **18th July 2003**\n\n### RENT\n\nThe Monthly Rent for the Rental Property shall be **€1,200**, payable on or before the 5th day of each month to the Landlord via direct bank transfer.\n\n### SECURITY DEPOSIT\n\nThe Tenant shall pay a Security Deposit of **€1,200** upon signing this Agreement. The deposit shall be held by the Landlord as security for the Tenant's fulfillment of their obligations under this Agreement.\n\n### UTILITIES\n\nThe Tenant agrees to be fully responsible for all utilities associated with the Rental Property, including electricity, water, gas, and internet charges.\n\n### USE OF PREMISES\n\nThe Tenant agrees to use the premises solely for residential purposes and to comply with all local laws and regulations.\n\n### PET POLICY\n\nPets are permitted on the premises with prior written consent of the Landlord. An additional security deposit of **€200** is required for tenants with pets.\n\n### ADDITIONAL PROVISIONS\n\n1. The Tenant agrees to maintain the premises in a clean and habitable state.\n2. The Tenant shall not conduct any business or illegal activity on the premises.\n3. The Tenant agrees to promptly report any maintenance issues to the Landlord.\n\n### SIGNATURES\n\n**Landlord Signature:** ___________________ \n**Date:** ___________________\n\n**Tenant Signature:** Matthieu du Marion \n**Date:** 19th July 2002\n\n---\n\nPlease retain a copy of this agreement for your records."},{"content":"{\"fields_to_redact\":[{\"string\":\"19th day of July, 2002\",\"pii_type\":\"date\"},{\"string\":\"Mademoiselle Antoinette Lefevre\",\"pii_type\":\"person_name\"},{\"string\":\"27, Allée du Chateau\",\"pii_type\":\"street_address\"},{\"string\":\"antoinette.lefevre@landlords.net\",\"pii_type\":\"email_address\"},{\"string\":\"001-682-319-7721\",\"pii_type\":\"phone_number\"},{\"string\":\"Monsieur Matthieu du Marion\",\"pii_type\":\"person_name\"},{\"string\":\"15, rue de Giraud, 95543 Gros\",\"pii_type\":\"street_address\"},{\"string\":\"001-546-208-5668\",\"pii_type\":\"phone_number\"},{\"string\":\"arthur21@example.net\",\"pii_type\":\"email_address\"},{\"string\":\"15, rue de Giraud, 95543 Gros\",\"pii_type\":\"street_address\"},{\"string\":\"19th July 2002\",\"pii_type\":\"date\"},{\"string\":\"18th July 2003\",\"pii_type\":\"date\"},{\"string\":\"Matthieu du Marion\",\"pii_type\":\"person_name\"},{\"string\":\"19th July 2002\",\"pii_type\":\"date\"}]}","refusal":null,"role":"assistant"}],"rejected_message":{"content":"{\"fields_to_redact\":[{\"string\":\"19th day of July, 2002\",\"pii_type\":\"date\"},{\"string\":\"Mademoiselle Antoinette Lefevre\",\"pii_type\":\"person_name\"},{\"string\":\"27, Allée du Chateau\\n95432 Montrouge\",\"pii_type\":\"street_address\"},{\"string\":\"antoinette.lefevre@landlords.net\",\"pii_type\":\"email_address\"},{\"string\":\"001-682-319-7721\",\"pii_type\":\"phone_number\"},{\"string\":\"Monsieur Matthieu du Marion\",\"pii_type\":\"person_name\"},{\"string\":\"15, rue de Giraud, 95543 Gros\",\"pii_type\":\"street_address\"},{\"string\":\"001-546-208-5668\",\"pii_type\":\"phone_number\"},{\"string\":\"arthur21@example.net\",\"pii_type\":\"email_address\"},{\"string\":\"19th July 2002\",\"pii_type\":\"date\"},{\"string\":\"18th July 2003\",\"pii_type\":\"date\"},{\"string\":\"Matthieu du Marion\",\"pii_type\":\"person_name\"},{\"string\":\"19th July 2002\",\"pii_type\":\"date\"}]}","refusal":null,"role":"assistant","tool_calls":[]},"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a company_memo. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"**Company Memo**\n\nTo: All Employees \nFrom: Jade Harris, Director of Communications \nDate: June 6, 1986 \nSubject: Exciting Partnership Announcement\n\nDear Team,\n\nI hope this memo finds you well. As we continually seek to expand our horizons and foster innovation, I am thrilled to announce a groundbreaking partnership that aligns with our mission to lead the market in quality and service.\n\nEffective immediately, Marchal Seguin SA, a trailblazer in manufacturing solutions, has joined forces with us to venture into new markets and expedite the development of next-generation technologies. This collaboration signifies not only a major milestone in our growth trajectory but also enhances the depth and breadth of our expertise.\n\nMarchal Seguin SA has an impeccable reputation in the industry, known for their innovation and integrity, and we are honored to work alongside them. This partnership will bring about transformative enhancements to our product lines, allowing us to better serve our clients and achieve our shared vision of excellence.\n\nAll departments will receive further instructions in the coming weeks on how to integrate our operations seamlessly. I encourage everyone to embrace this opportunity with the full confidence that it will propel us to new heights.\n\nPlease feel free to reach out to your department heads or directly to me if you have any queries or require further information.\n\nLet us march together towards a brighter future!\n\nBest regards,\n\nJade Harris \nDirector of Communications "},{"content":"{\"fields_to_redact\":[{\"string\":\"June 6, 1986\",\"pii_type\":\"date\"},{\"string\":\"Marchal Seguin SA\",\"pii_type\":\"organization_name\"},{\"string\":\"Marchal Seguin SA\",\"pii_type\":\"organization_name\"},{\"string\":\"Jade Harris\",\"pii_type\":\"person_name\"},{\"string\":\"Jade Harris\",\"pii_type\":\"person_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Immediate Assistance Required\n\nDate: January 27, 1999 \nFrom: alicia60@example.org\n\nDear Support Team,\n\nMy name is Frédérique Dupuis and I am reaching out to you on behalf of Savage, Rivera and Walker. Our organization has encountered an issue that requires your urgent attention. \n\nRecently, while attempting to update our records, I noticed discrepancies that may be related to a potential breach. My personal ID, 185047505626329, seems to have been used without authorization to access secure files within our database.\n\nAs a practicing Christian, I believe in honesty and transparency, and I'm deeply concerned about this situation. I trust in your company’s expertise and swift action in resolving security matters.\n\nFurthermore, please ensure that our organization's sensitive data remains confidential and take any necessary steps to prevent such incidents in the future.\n\nThank you for your prompt attention to this matter. Please contact me at alicia60@example.org to confirm receipt of this email and for any further instructions.\n\nSincerely,\n\nFrédérique Dupuis \nSavage, Rivera and Walker"},{"content":"{\"fields_to_redact\":[{\"string\":\"January 27, 1999\",\"pii_type\":\"date\"},{\"string\":\"alicia60@example.org\",\"pii_type\":\"email_address\"},{\"string\":\"Frédérique Dupuis\",\"pii_type\":\"person_name\"},{\"string\":\"Savage, Rivera and Walker\",\"pii_type\":\"organization_name\"},{\"string\":\"185047505626329\",\"pii_type\":\"personal_id\"},{\"string\":\"Christian\",\"pii_type\":\"religious_affiliation\"},{\"string\":\"alicia60@example.org\",\"pii_type\":\"email_address\"},{\"string\":\"Frédérique Dupuis\",\"pii_type\":\"person_name\"},{\"string\":\"Savage, Rivera and Walker\",\"pii_type\":\"organization_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Urgent Assistance Required with Your Garrett.biz Account \n\nDate: June 14, 1975\n\nFrom: Stacey Archer \nTo: Kathryn Thomas-Newton \n\nDear Kathryn Thomas-Newton,\n\nI hope this message finds you well. I am reaching out from the Garrett.biz customer support team concerning your recent inquiry.\n\nWe received your request for assistance with your account settings earlier this week. Our team has analyzed the issue and we are here to provide the necessary steps to help you resolve any difficulties you are experiencing.\n\nIf you could provide us with a detailed description of the issue, including any error messages or unusual behavior you encounter, it will help us address your specific needs more effectively.\n\nMeanwhile, if you need immediate assistance, we have set up a direct line for priority users like yourself. You can contact us anytime at +1-408-818-0671x03088, and one of our senior specialists will be happy to assist you.\n\nWe understand how important your time is, and we strive to ensure that your experience with Garrett.biz remains seamless. Please do not hesitate to get in touch with any further questions or concerns.\n\nThank you for your cooperation and patience.\n\nBest regards,\n\nStacey Archer \nCustomer Support Specialist \nGarrett.biz\n\nP.S. Maintaining your account security is our top priority. Please do not share your login credentials with anyone."},{"content":"{\"fields_to_redact\":[{\"string\":\"June 14, 1975\",\"pii_type\":\"date\"},{\"string\":\"stacey41@example.net\",\"pii_type\":\"email_address\"},{\"string\":\"kathryn_thomasnewton@garrett.biz\",\"pii_type\":\"email_address\"},{\"string\":\"+1-408-818-0671x03088\",\"pii_type\":\"phone_number\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a medical_record. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"**Confidential Medical Record**\n\n---\n\n**Patient Information:**\n\n- **Name:** Kelly Baker \n- **Date of Birth:** November 2, 2018 \n- **Age:** 91 years \n- **Gender:** Female\n\n**Address:** \n32798 Johnson Island \nKellerfort, SC 07391 \n\n---\n\n**Medical History:**\n\n- **Primary Condition:** Temporomandibular Joint Disorder (TMJ) \n\n**Diagnosis Date:** October 12, 1995 \n\n**Symptoms Observed:**\n\n- Jaw pain and tenderness\n- Difficulty chewing\n- Clicking or popping sounds in the jaw joint\n- Locking of the jaw joint\n\n**Treatment Plan:**\n\n- Continued use of oral splints and mouthguards to prevent teeth grinding.\n- Suggested physical therapy exercises targeting jaw muscles.\n- Regular application of ice packs for pain relief.\n- Prescribed low-dose muscle relaxants to ease jaw tension.\n\n**Lifestyle and Risk Factors:**\n\n- Reports of stress-induced jaw clenching.\n- Patient observes a diet that minimizes hard foods to reduce jaw strain.\n- Advised to practice relaxation techniques, such as yoga and meditation, to decrease stress levels.\n\n**Follow-Up:**\n\n- **Next Appointment:** Scheduled for February 20, 2024, at Kellerfort Medical Center\n- **Contact Physician:** Dr. Elaine Trevors\n\n**Notes:**\n\nPatient's advanced age necessitates close monitoring of overall health and the ongoing management of TMJ symptoms to preserve quality of life.\n\n---\n\n**End of Record**\n\n[This medical record is subject to HIPAA regulations. Unauthorized disclosure of this information is prohibited.]"},{"content":"{\"fields_to_redact\":[{\"string\":\"Kelly Baker\",\"pii_type\":\"person_name\"},{\"string\":\"November 2, 2018\",\"pii_type\":\"date_of_birth\"},{\"string\":\"91 years\",\"pii_type\":\"age\"},{\"string\":\"Female\",\"pii_type\":\"gender\"},{\"string\":\"32798 Johnson Island\",\"pii_type\":\"street_address\"},{\"string\":\"Kellerfort, SC 07391\",\"pii_type\":\"street_address\"},{\"string\":\"Temporomandibular Joint Disorder\",\"pii_type\":\"medical_condition\"},{\"string\":\"October 12, 1995\",\"pii_type\":\"date\"},{\"string\":\"February 20, 2024\",\"pii_type\":\"date\"},{\"string\":\"Kellerfort Medical Center\",\"pii_type\":\"organization_name\"},{\"string\":\"Dr. Elaine Trevors\",\"pii_type\":\"person_name\"}]}","refusal":null,"role":"assistant"}],"rejected_message":{"content":"{\"fields_to_redact\":[{\"string\":\"Kelly Baker\",\"pii_type\":\"person_name\"},{\"string\":\"November 2, 2018\",\"pii_type\":\"date_of_birth\"},{\"string\":\"91 years\",\"pii_type\":\"age\"},{\"string\":\"Female\",\"pii_type\":\"gender\"},{\"string\":\"32798 Johnson Island\\nKellerfort, SC 07391\",\"pii_type\":\"street_address\"},{\"string\":\"October 12, 1995\",\"pii_type\":\"date\"},{\"string\":\"February 20, 2024\",\"pii_type\":\"date\"},{\"string\":\"Dr. Elaine Trevors\",\"pii_type\":\"person_name\"}]}","refusal":null,"role":"assistant","tool_calls":[]},"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a personal_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Exciting New Opportunities!\n\nHi Stephanie,\n\nI hope this message finds you well. It has been a while since we last caught up, and I wanted to share some exciting news with you. There are some new project opportunities coming up at Padrón y Juárez y Asociados, and I immediately thought of you!\n\nAs you know, our firm has been expanding rapidly over the past year, and we are currently looking for experienced professionals to join our team. Your expertise in market analysis would be a perfect fit for our upcoming venture in the renewable energy sector.\n\nLet's schedule a time to chat further. Please let me know your availability over the next week so we can discuss this in more detail. You can reach me directly at my work email or we can grab a cup of coffee if you'll be in town anytime soon.\n\nLooking forward to catching up soon!\n\nWarm regards,\n\nIsabelle García\nBusiness Development Lead\nPadrón y Juárez y Asociados\n\nP.S. Feel free to reach me at my personal email too if that's easier for you: zaugle.bise@example.com.\n\n---\nThis email is intended for Stephanie Shaw-Taylor (you) and is sent from zaragozaitzel@example.net. If you received this message in error, please notify us immediately by replying to this message and delete the original email. Thank you for your cooperation."},{"content":"{\"fields_to_redact\":[{\"string\":\"zaugle.bise@example.com\",\"pii_type\":\"email_address\"},{\"string\":\"zaragozaitzel@example.net\",\"pii_type\":\"email_address\"},{\"string\":\"Stephanie Shaw-Taylor\",\"pii_type\":\"person_name\"},{\"string\":\"Isabelle García\",\"pii_type\":\"person_name\"},{\"string\":\"Padrón y Juárez y Asociados\",\"pii_type\":\"organization_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a insurance_policy. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"---\n\n**INSURANCE POLICY DOCUMENT**\n\n**Policyholder Information:**\n\n- **Name:** Brian Branch\n- **Date of Birth:** November 15, 1956 (Age: 67)\n- **Personal ID:** 137-06-7728-869-511\n- **Email Address:** debora11@example.com\n- **Contact Number:** (555) 897-4321\n- **Address:** 742 Maple Street, Grovetown, Massachusetts, 01352\n\n---\n\n**Policy Details:**\n\n- **Policy Number:** POL-BC-8934761\n- **Policy Type:** Comprehensive Health Coverage\n- **Effective Date:** January 1, 2023\n- **Expiration Date:** December 31, 2023\n- **Premium Amount:** $3,200 annually\n- **Payment Schedule:** Monthly ($266.67 per installment)\n\n---\n\n**Coverage Information:**\n\nBrian Branch’s policy covers a wide range of health services:\n\n- **Hospitalization and Surgery**\n- **Outpatient Services**\n- **Prescription Medications**\n- **Preventive Care Services**\n- **Emergency Room Services**\n\n**Special Conditions:**\n\n- **Pre-Existing Medical Condition:** Oral Cancer\n - **Coverage:** Includes treatment, chemotherapy, and follow-up check-ups with certified oncologists. Limited to $150,000 per insurance period.\n - **Exclusions:** Experimental treatments are not covered.\n\n**Additional Coverage:**\n\n- **Dental and Vision:** Supplemental plans available upon request.\n- **Mental Health Services:** Included as part of the standard plan.\n\n---\n\n**Policyholder Rights and Responsibilities:**\n\n- Policyholder must inform the insurance provider of any changes in medical condition, residence, or contact information within 30 days.\n- Claims must be submitted within 90 days of service for reimbursement eligibility.\n- For detailed terms, conditions, and exclusions, refer to the full policy documents. Requests for these documents can be made by emailing service@ourhealthinsure.com.\n\n**Customer Service:**\n\nFor assistance, contact our customer service hotline at 1-800-555-INSURE (4678) or email: support@ourhealthinsure.com.\n\n--- \n\n**End of Document**"},{"content":"{\"fields_to_redact\":[{\"string\":\"Brian Branch\",\"pii_type\":\"person_name\"},{\"string\":\"November 15, 1956\",\"pii_type\":\"date_of_birth\"},{\"string\":\"67\",\"pii_type\":\"age\"},{\"string\":\"137-06-7728-869-511\",\"pii_type\":\"personal_id\"},{\"string\":\"debora11@example.com\",\"pii_type\":\"email_address\"},{\"string\":\"(555) 897-4321\",\"pii_type\":\"phone_number\"},{\"string\":\"742 Maple Street, Grovetown, Massachusetts, 01352\",\"pii_type\":\"street_address\"},{\"string\":\"Oral Cancer\",\"pii_type\":\"medical_condition\"},{\"string\":\"service@ourhealthinsure.com\",\"pii_type\":\"email_address\"},{\"string\":\"support@ourhealthinsure.com\",\"pii_type\":\"email_address\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a company_memo. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"---\n\nTo: All Employees \nFrom: Henriette Benard, Director of Human Resources \nDate: April 24, 1981 \nSubject: New Initiatives and Updates from Roberts PLC \n\nDear Team,\n\nI hope this memo finds you all well. As we are in the midst of a transformative year, I am writing to update you on several key initiatives and organizational changes within Roberts PLC. These changes are designed to enhance productivity, foster innovation, and ensure our long-term success.\n\n1. **Employee Wellness Program** \nWe are excited to launch a comprehensive Employee Wellness Program starting next month. This initiative aims to promote mental and physical health by offering gym memberships, on-site yoga classes, and wellness workshops. Participation is voluntary, but we encourage everyone to take advantage of these resources.\n\n2. **Roberts PLC Innovation Lab** \nTo stay ahead in the competitive landscape, we are establishing the Roberts PLC Innovation Lab. This lab will be a hub for creative thinking and problem-solving, allowing cross-departmental teams to develop new ideas and prototypes. We invite interested employees to apply for a position in this cutting-edge facility.\n\n3. **Quarterly Feedback System** \nWe value your input and are introducing a streamlined Quarterly Feedback System, accessible through our intranet. This will replace the annual review system and allow for more frequent communication and quicker decision-making processes.\n\n4. **Sustainability Efforts** \nRoberts PLC is committed to reducing our carbon footprint. Our goal is to achieve a 30% reduction in emissions by the end of 1985. Initiatives include transitioning to renewable energy sources and implementing stricter waste management protocols. All departments will receive guidelines for contributing to these goals.\n\nYour dedication and hard work are what drive Roberts PLC forward. Together, we are building a future where our shared success is achieved with sustainability, innovation, and employee well-being at the forefront.\n\nPlease feel free to reach out with any questions or suggestions. I look forward to seeing these initiatives unfold and our company continue to thrive.\n\nThank you for your commitment and enthusiasm.\n\nBest regards,\n\nHenriette Benard \nDirector of Human Resources \nRoberts PLC \n\n---"},{"content":"{\"fields_to_redact\":[{\"string\":\"April 24, 1981\",\"pii_type\":\"date\"},{\"string\":\"Henriette Benard\",\"pii_type\":\"person_name\"},{\"string\":\"Roberts PLC\",\"pii_type\":\"organization_name\"},{\"string\":\"Roberts PLC\",\"pii_type\":\"organization_name\"},{\"string\":\"Roberts PLC\",\"pii_type\":\"organization_name\"},{\"string\":\"Henriette Benard\",\"pii_type\":\"person_name\"},{\"string\":\"Roberts PLC\",\"pii_type\":\"organization_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Trouble Accessing Account - Immediate Assistance Needed\n\nFrom: evanserin@example.com \nTo: support@polarnetworks.com \nDate: 2014-09-22 \n\nDear Polar Networks Support Team,\n\nI hope this message finds you well. I am writing to seek assistance with an issue I am experiencing while attempting to access my Polar Networks account. As of this morning, I have been unable to log in, and it's causing some disruption as I heavily rely on your services for my business communications.\n\nThe problem began around 09:00 AM today when I received an error message stating that my credentials were incorrect. I am certain that the password I entered was correct. I have tried the 'forgot password' feature, but unfortunately, I haven't received any reset email yet.\n\nAs I was preparing for an important presentation, it is crucial to access certain files stored in my account. I would appreciate it if your technical team could look into this matter with urgency. For your reference, my account is linked to evanserin@example.com, and my customer reference number is ZZ 632124 T.\n\nPlease advise on the necessary steps I need to take to resolve this matter at your earliest convenience.\n\nThank you in advance for your prompt assistance.\n\nWarm regards,\n\nErin Evans \nFounder & COO, Evans Enterprises \nPhone: (555) 839-4725"},{"content":"{\"fields_to_redact\":[{\"string\":\"evanserin@example.com\",\"pii_type\":\"email_address\"},{\"string\":\"2014-09-22\",\"pii_type\":\"date\"},{\"string\":\"evanserin@example.com\",\"pii_type\":\"email_address\"},{\"string\":\"ZZ 632124 T\",\"pii_type\":\"personal_id\"},{\"string\":\"(555) 839-4725\",\"pii_type\":\"phone_number\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a company_memo. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Updated Policy on Remote Work Arrangements \n\n**Ainoa Seguí Galván S.L.N.E** \n**Memo:** COMPANY-REM-WK-2020-07-21 \n**To:** All Employees \n**From:** HR Department \n**Date:** July 21, 2020 \n\nDear Team, \n\nWe hope this message finds you well. In response to the ongoing changes in workplace dynamics and based on feedback from our recent employee survey, we're pleased to introduce an updated policy on remote work arrangements. \n\nPlease note the key changes: \n\n1. **Eligibility:** All full-time employees with a tenure of at least six months are now eligible for remote work, unless otherwise specified by department needs.\n\n2. **Schedule Flexibility:** Employees can opt to work remotely up to three days a week. Schedules should be coordinated with department heads to ensure smooth operations.\n\n3. **Equipment Support:** To ensure a productive home setup, our IT department will conduct virtual assessments and provide necessary equipment. Please be on the lookout for an email from Mariana Ramon (marianaramon@example.org) with details on scheduling your assessment.\n\n4. **Personal Data Policy:** Please be reminded that all remote activities must comply with our Personal Data Handling Policy. Ensure all personal identification such as your employee ID is always protected. Be cautious not to disclose information like your personal ID number (e.g., 092-59-9122) unnecessarily.\n\n5. **Tech Support:** Our tech support team is available to assist with technical issues during remote engagement. Reach out through our support portal or directly via email.\n\nWe remain committed to supporting our team's well-being and accommodating the evolving work environment. Please review the complete policy attached with this memo and direct any questions or feedback to your line manager.\n\nYour efforts and flexibility are greatly appreciated as we continue to navigate these changes together. \n\nBest Regards, \nHR Department \nAinoa Seguí Galván S.L.N.E \n\n**Attachment:** Remote_Work_Policy_July2020.pdf \n\nConfidentiality Notice: This memo contains information that is proprietary and confidential to Ainoa Seguí Galván S.L.N.E. If you are not the intended recipient, please notify the sender by email and delete the message."},{"content":"{\"fields_to_redact\":[{\"string\":\"Ainoa Seguí Galván S.L.N.E\",\"pii_type\":\"organization_name\"},{\"string\":\"July 21, 2020\",\"pii_type\":\"date\"},{\"string\":\"Mariana Ramon\",\"pii_type\":\"person_name\"},{\"string\":\"marianaramon@example.org\",\"pii_type\":\"email_address\"},{\"string\":\"092-59-9122\",\"pii_type\":\"personal_id\"},{\"string\":\"Ainoa Seguí Galván S.L.N.E\",\"pii_type\":\"organization_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Urgent: Assistance Required for Account Access\n\nDate: 1981-04-07\n\nDear Support Team,\n\nI hope this message finds you well. I am writing to seek immediate assistance regarding access issues with my professional account. I have been experiencing difficulties logging in, and it's imperative that this be resolved swiftly due to the nature of my work responsibilities.\n\nPlease allow me to provide some details for your reference:\n\n- **Name**: Miss Rachael Owen\n- **Date of Birth**: 1971-03-20\n- **Email Address**: kbilbao@example.com\n- **Demographic Group**: Hispanic or Latino\n- **Personal ID**: 126-20-8754\n- **Alternative ID**: 341 043 511\n\nFor security purposes, I am sharing my secure credential here: (CpTjSs(39 – Please advise if any further identity verification is needed.\n\nCould you please investigate this incident at your earliest convenience? I have a pressing deadline approaching and require immediate access to resume my tasks effectively. Your prompt attention to this matter would be greatly appreciated.\n\nThank you very much for your support and understanding. I am looking forward to your prompt response.\n\nWarm regards,\n\nMiss Rachael Owen\n\nContact Number: [Confidential] \n\nP.S.: If possible, could you also confirm the best practices for ensuring this issue does not recur in the future? Thank you in advance."},{"content":"{\"fields_to_redact\":[{\"string\":\"1981-04-07\",\"pii_type\":\"date\"},{\"string\":\"Rachael Owen\",\"pii_type\":\"person_name\"},{\"string\":\"1971-03-20\",\"pii_type\":\"date_of_birth\"},{\"string\":\"kbilbao@example.com\",\"pii_type\":\"email_address\"},{\"string\":\"Hispanic or Latino\",\"pii_type\":\"demographic_group\"},{\"string\":\"126-20-8754\",\"pii_type\":\"personal_id\"},{\"string\":\"341 043 511\",\"pii_type\":\"other_id\"},{\"string\":\"CpTjSs(39\",\"pii_type\":\"secure_credential\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Urgent: Login Issue and Banking Support Needed\n\nDate: April 23, 1988\n\nFrom: Luce Faivre \n\nTo: Jean.com Support Team\n\nDear Support Team,\n\nI hope this message finds you well. I am writing to seek your assistance regarding an urgent issue I'm experiencing with accessing my account associated with your services.\n\nRecently, I've encountered persistent difficulties logging into my account on your website, **jean.com**. This is particularly concerning given that I need to access certain secured documents stored within my account for an ongoing project. Despite multiple attempts, my secure credential \"6N^1U_$oIT\" does not authorize successful access, prompting a \"Credential Error\" message each time. Please advise on the next steps to resolve this issue.\n\nAdditionally, I've noticed unexplained charges linked to my banking account number FLYJ01276102084527. It appears there might have been an unauthorized transaction, and I would like to dispute these charges as they do not align with my recent activities. If there is any paperwork or additional information required to expedite this process, kindly let me know.\n\nI trust your esteemed support in addressing these issues promptly. Kindly confirm the receipt of this email, and I look forward to your immediate response, as time is of the essence in resolving these matters.\n\nThank you for your attention and support.\n\nBest regards,\n\nLuce Faivre"},{"content":"{\"fields_to_redact\":[{\"string\":\"April 23, 1988\",\"pii_type\":\"date\"},{\"string\":\"Luce Faivre\",\"pii_type\":\"person_name\"},{\"string\":\"jeronimosolano@example.net\",\"pii_type\":\"email_address\"},{\"string\":\"jean.com\",\"pii_type\":\"domain_name\"},{\"string\":\"6N^1U_$oIT\",\"pii_type\":\"secure_credential\"},{\"string\":\"FLYJ01276102084527\",\"pii_type\":\"banking_number\"},{\"string\":\"Luce Faivre\",\"pii_type\":\"person_name\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a personal_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Catching Up & Exciting News 🎉\n\nHi Suzanne,\n\nI hope this email finds you in great spirits!\n\nI wanted to take a moment to reach out and let you know about some exciting news on my end. After months of anticipation, I've finally accepted the position at the San Francisco branch as the senior project manager. It's a thrilling opportunity, and I'm looking forward to the new challenges it brings.\n\nAlso, I fondly remember our last coffee rendezvous at that quaint little bistro on Rue de L'Arbre. We should definitely plan to do that again soon! Perhaps you could come visit and enjoy a weekend in the Bay Area?\n\nOn a different note, I wanted to remind you that the last day for submitting those important documents is fast approaching. Could you please send me the final versions by November 10, 2023? It’s crucial for our end-of-year assessments.\n\nHope everything is magical with you and that your arrangements for the upcoming holiday season are shaping up splendidly. I'm so excited to hear all about your plans!\n\nThank you once again for all your help and unwavering support. It's truly valued more than words can express. Feel free to call me anytime or drop me a line at moreauraymond@example.net.\n\nLooking forward to catching up soon!\n\nWarm regards,\n\nRaymond Moreau \nPersonal ID: 765-56-1682 \nP.S. Give my love to Hugo and the cats!"},{"content":"{\"fields_to_redact\":[{\"string\":\"moreauraymond@example.net\",\"pii_type\":\"email_address\"},{\"string\":\"November 10, 2023\",\"pii_type\":\"date\"},{\"string\":\"765-56-1682\",\"pii_type\":\"personal_id\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a support_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Issue with Account Access\n\nDate: Saturday, September 21, 1974\nFrom: Vicki Fischer \nTo: support@jenkins-black.info\n\nDear Jenkins-Black Support Team,\n\nI hope this message finds you well. My name is Jeff Burnett, and I am experiencing difficulty accessing my account associated with jenkins-black.info. Despite multiple attempts to reset my password, I keep encountering an error that prevents me from logging in.\n\nBelow are the error details:\n- Error Message: \"Password Reset Unsuccessful. Please Try Again Later.\"\n- Attempts made: 5 attempts since 09/15/1974.\n\nMoreover, I haven’t received the password reset link on my registered email, vickifischer@example.net. I have checked my spam folder but to no avail. To assist you further in resolving this issue, here are a few details about my account and contact information:\n\n- Full Name: Jeff Burnett\n- Registered Email: vickifischer@example.net\n- Phone Number: 001-914-578-1622\n- Physical Address: Prolongación Oaxaca 026 Interior 863\n San Julio los altos, BC 08535-9816\n\nI would appreciate if you could look into this matter at your earliest convenience, as this account is crucial for my ongoing project. Feel free to reach out to me via email or my direct phone number listed above if you require any additional information.\n\nThank you for your prompt attention to this issue. I look forward to your response.\n\nWarm regards,\n\nJeff Burnett \nSan Julio los altos, BC \n001-914-578-1622"},{"content":"{\"fields_to_redact\":[{\"string\":\"September 21, 1974\",\"pii_type\":\"date\"},{\"string\":\"vickifischer@example.net\",\"pii_type\":\"email_address\"},{\"string\":\"jenkins-black.info\",\"pii_type\":\"domain_name\"},{\"string\":\"09/15/1974\",\"pii_type\":\"date\"},{\"string\":\"vickifischer@example.net\",\"pii_type\":\"email_address\"},{\"string\":\"Jeff Burnett\",\"pii_type\":\"person_name\"},{\"string\":\"vickifischer@example.net\",\"pii_type\":\"email_address\"},{\"string\":\"001-914-578-1622\",\"pii_type\":\"phone_number\"},{\"string\":\"Prolongación Oaxaca 026 Interior 863\\n San Julio los altos, BC 08535-9816\",\"pii_type\":\"street_address\"},{\"string\":\"Jeff Burnett\",\"pii_type\":\"person_name\"},{\"string\":\"001-914-578-1622\",\"pii_type\":\"phone_number\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a personal_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Catching Up and Future Plans\n\nHi Keith,\n\nI hope this email finds you well. It's been a while since our last conversation, and I thought I’d reach out to see how everything is going.\n\nFirstly, congratulations on your recent promotion at Robinson and Sons! I'm sure your keen eye for detail and innovative mindset were crucial assets in earning this recognition. How are you finding your new role? I can only imagine the exciting projects you must be leading. Please share if there's anything groundbreaking that Robinson and Sons is delving into these days. I've always admired the forward-thinking atmosphere there.\n\nAlso, I wanted to let you know that I forwarded your email invitation regarding the annual tech symposium to my colleagues who might be interested. I would've loved to join, but unfortunately, I have prior commitments on that date. However, I'm sure it will be a great success with the lineup you have planned.\n\nOn a personal note, I've been dedicating more time to photography—finally took that leap with a new DSLR! If you’re ever interested, I’d love to go on a photo walk sometime soon. Maybe we could explore some of the new trails around the city. Nature and autumn hues can be such inspiration, right?\n\nPlease send regards to your family. If you’re free one of these weekends, let’s catch up over coffee. Just shoot me a reply or call when you have the time.\n\nLooking forward to hearing from you!\n\nBest,\n[Your Name]\n\nEmail: jesus59@example.net"},{"content":"{\"fields_to_redact\":[{\"string\":\"Robinson and Sons\",\"pii_type\":\"organization_name\"},{\"string\":\"jesus59@example.net\",\"pii_type\":\"email_address\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a personal_email. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"Subject: Exciting News from Italy!\n\nHi Alex,\n\nI hope this email finds you well. I just had to share some thrilling news with a dear friend like you!\n\nI recently got a chance to take a spontaneous trip to Italy. Remember how we always talked about visiting Rome together during our university days? Well, I finally made it! The Colosseum was beyond words, and the gelato—simply divine!\n\nBut here's the cherry on top: I met someone amazing! His name is Matteo, and he's a local artist with the most enchanting stories about the city's history. We bonded over art and long walks along the Tiber River. I can't wait to introduce you two someday!\n\nPlease give my regards to Chloe; I've attached a few pictures for the both of you to enjoy. Can't wait to catch up when I return next week.\n\nTake care and talk soon!\n\nWarm regards, \nSara Brown\n\nP.S. Please use my temporary email, tluna@example.org, in case you need to reach me while I'm here. Having some trouble with the usual one!"},{"content":"{\"fields_to_redact\":[{\"string\":\"Sara Brown\",\"pii_type\":\"person_name\"},{\"string\":\"Matteo\",\"pii_type\":\"person_name\"},{\"string\":\"tluna@example.org\",\"pii_type\":\"email_address\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} +{"messages":[{"role":"system","content":"\n You are an expert model trained to redact potentially sensitive information from documents. You have been given a document to redact. The document is a educational_transcript. Your goal is to accurately redact the sensitive information from the document. Sensitive information can be in one of the following categories:\n \n - age: a person's age\n- credit_card_info: a credit card number, expiration date, CCV, etc.\n- nationality: a country when used to reference place of birth, residence, or citizenship\n- date: a specific calendar date\n- date_of_birth: a specific calendar date representing birth\n- domain_name: a domain on the internet\n- email_address: an email ID\n- demographic_group: Anything that identifies race or ethnicity\n- gender: a gender identifier\n- personal_id: Any ID string like a national ID, subscriber number, etc.\n- other_id: Any ID not associated with a person like an organization ID, database ID, etc.\n- banking_number: a number associated with a bank account\n- medical_condition: A diagnosis, treatment code or other information identifying a medical condition\n- organization_name: name of an organization\n- person_name: name of a person\n- phone_number: a telephone number\n- street_address: a physical address\n- password: a secure string used for authentication\n- secure_credential: any secure credential like an API key, private key, 2FA token\n- religious_affiliation: anything that identifies religious affiliation\n \n You should return the specific string that needs to be redacted, along with the category of sensitive information that it belongs to. If there is no sensitive information in the document, return no strings. \n "},{"role":"user","content":"**Educational Transcript**\n\n**Name:** Angela Snyder \n**Date of Birth:** January 29, 2024 \n**Student ID:** JS2024-AGSN123 \n**Enrollment Year:** 2029 \n**Graduation Year:** 2033 \n\n**School/Organization Name:** Joyce and Sons Education Institute \n**Major:** Environmental Sciences \n**Minor:** Creative Writing \n\n---\n\n**Academic Performance Overview:**\n\n**Freshman Year (2029-2030):** \n- Introduction to Environmental Science - Grade: A \n- Basics of Ecology - Grade: A- \n- Creative Writing Fundamentals - Grade: B+ \n- Statistics for Environmental Research - Grade: A \n\n**Sophomore Year (2030-2031):** \n- Climate Change and Global Warming - Grade: B \n- Advanced Composition and Rhetoric - Grade: A \n- Plant Biology and Genetics - Grade: B \n- English Literature I - Grade: B+ \n\n**Junior Year (2031-2032):** \n- Waste Management and Recycling - Grade: A- \n- Renewable Energy Sources - Grade: A \n- Poetry and Poetic Devices - Grade: A- \n- World Literature - Grade: B+ \n\n**Senior Year (2032-2033):** \n- Marine and Coastal Environment - Grade: A \n- Environmental Policy and Legislation - Grade: A- \n- Fiction Writing Workshop - Grade: B+ \n- Seminar in Sustainability - Grade: A \n\n**Extracurricular Activities:**\n\n- President of the Environmental Awareness Club (2031-2033) \n- Editor for the School's Literary Magazine, \"Penscape\" (2031-2032) \n- Volunteer for the Coastal Clean-Up Initiative (2029-2033) \n- Participant in the Inter-School Debate Championship (2030) \n\n---\n\n**School Comments:** \nAngela has shown remarkable dedication to her studies and extracurricular activities, demonstrating leadership in environmental advocacy. Her creative flair in writing complements her analytical skills in scientific inquiry. Angela's multi-disciplinary approach will serve her well in future endeavors.\n\n**Official Seal:** \n*Joyce and Sons Education Institute* \n*Excellence through Enlightenment* \n\n**Signature:** \n______________________ \nRegistrar, Joyce and Sons\n\n**Date of Issue:** June 10, 2033"},{"content":"{\"fields_to_redact\":[{\"string\":\"Angela Snyder\",\"pii_type\":\"person_name\"},{\"string\":\"January 29, 2024\",\"pii_type\":\"date_of_birth\"},{\"string\":\"JS2024-AGSN123\",\"pii_type\":\"personal_id\"},{\"string\":\"Joyce and Sons Education Institute\",\"pii_type\":\"organization_name\"},{\"string\":\"Angela\",\"pii_type\":\"person_name\"},{\"string\":\"June 10, 2033\",\"pii_type\":\"date\"}]}","refusal":null,"role":"assistant"}],"tools":[],"response_format":{"type":"json_schema","json_schema":{"name":"RedactionResponse","schema":{"type":"object","$defs":{"RedactionField":{"type":"object","title":"RedactionField","required":["string","pii_type"],"properties":{"string":{"type":"string","title":"String","description":"The exact matching string to redact. Include any whitespace or punctuation. Must be an exact string match!"},"pii_type":{"enum":["age","credit_card_info","nationality","date","date_of_birth","domain_name","email_address","demographic_group","gender","personal_id","other_id","banking_number","medical_condition","organization_name","person_name","phone_number","street_address","password","secure_credential","religious_affiliation"],"type":"string","title":"Pii Type"}},"additionalProperties":false}},"title":"RedactionResponse","required":["fields_to_redact"],"properties":{"fields_to_redact":{"type":"array","items":{"$ref":"#/$defs/RedactionField"},"title":"Fields To Redact"}},"additionalProperties":false},"strict":true}},"split":"TRAIN"} diff --git a/dev/sft/distillation.py b/dev/sft/distillation.py new file mode 100644 index 00000000..dfb2cf8c --- /dev/null +++ b/dev/sft/distillation.py @@ -0,0 +1,67 @@ +"""Distillation example: Train a small model using completions from a large model.""" + +import asyncio +import os + +from dotenv import load_dotenv +from openai import AsyncOpenAI + +import art +from art.local import LocalBackend + +load_dotenv() + +if not os.environ.get("OPENROUTER_API_KEY"): + raise ValueError("OPENROUTER_API_KEY environment variable is required") + +TEACHER_MODEL = "qwen/qwen3-235b-a22b-2507" +STUDENT_BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct" +PROMPT = "Explain the concept of recursion in programming with a simple example." + + +async def main(): + # Get completion from teacher model + teacher_client = AsyncOpenAI( + api_key=os.environ["OPENROUTER_API_KEY"], + base_url="https://openrouter.ai/api/v1", + ) + + print(f"Getting completion from teacher model ({TEACHER_MODEL})...") + completion = await teacher_client.chat.completions.create( + model=TEACHER_MODEL, + messages=[{"role": "user", "content": PROMPT}], + ) + teacher_response = completion.choices[0].message.content + print( + f"Teacher response ({len(teacher_response)} chars):\n{teacher_response[:500]}..." + ) + + # Create trajectory from teacher completion + trajectory = art.Trajectory( + messages_and_choices=[ + {"role": "user", "content": PROMPT}, + {"role": "assistant", "content": teacher_response}, + ], + reward=0.0, + ) + + # Train student model + backend = LocalBackend() + student = art.TrainableModel( + name="distillation-demo-11", + project="sft-distillation", + base_model=STUDENT_BASE_MODEL, + ) + await student.register(backend) + + print(f"Training student model ({STUDENT_BASE_MODEL})...") + await student.train_sft( + [trajectory, trajectory, trajectory], + config=art.SFTConfig(learning_rate=2e-4), + verbose=True, + ) + print("Training complete!") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/dev/sft/sft-from-file.py b/dev/sft/sft-from-file.py new file mode 100644 index 00000000..3a0973e1 --- /dev/null +++ b/dev/sft/sft-from-file.py @@ -0,0 +1,31 @@ +"""Simple SFT training script using train_sft_from_file helper.""" + +import asyncio + +import art +from art.local import LocalBackend +from art.utils.sft import train_sft_from_file + + +async def main(): + backend = LocalBackend() + model = art.TrainableModel( + name="run-5", + project="sft-from-file", + base_model="Qwen/Qwen2.5-7B-Instruct", + ) + await model.register(backend) + + await train_sft_from_file( + model=model, + file_path="dev/sft/dataset.jsonl", + epochs=1, + chunk_size=10, + peak_lr=2e-4, + ) + + print("Training complete!") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/dev/sft/sft-warmup-before-rl.py b/dev/sft/sft-warmup-before-rl.py new file mode 100644 index 00000000..0cd258d8 --- /dev/null +++ b/dev/sft/sft-warmup-before-rl.py @@ -0,0 +1,123 @@ +"""Minimal example demonstrating SFT -> RL -> SFT switching.""" + +import asyncio +import os + +from dotenv import load_dotenv + +import art +from art.local import LocalBackend + +# Simple SFT trajectories - teach model to respond "maybe" +SFT_TRAJECTORIES = [ + art.Trajectory( + messages_and_choices=[ + {"role": "user", "content": "respond with yes or no"}, + {"role": "assistant", "content": "maybe"}, + ], + reward=0.0, # reward unused for SFT + ), +] * 10 + + +async def rl_rollout(client, model_name: str, prompt: str) -> art.Trajectory: + """Single RL rollout with reward based on response.""" + messages: art.Messages = [{"role": "user", "content": prompt}] + completion = await client.chat.completions.create( + messages=messages, model=model_name, max_tokens=10, timeout=30 + ) + choice = completion.choices[0] + content = choice.message.content or "" + + # Reward: "maybe" > "no" > "yes" > other + reward = {"maybe": 1.0, "no": 0.75, "yes": 0.5}.get(content.strip().lower(), 0.0) + return art.Trajectory(messages_and_choices=[*messages, choice], reward=reward) + + +async def main(): + load_dotenv() + + backend = LocalBackend() + model = art.TrainableModel( + name="sft-rl-switch-test-13", + project="sft-rl-demo", + base_model="Qwen/Qwen2.5-7B-Instruct", + ) + await model.register(backend) + + # ======================================================================== + # Phase 1: SFT + # ======================================================================== + print("\n[Phase 1] SFT training...") + await model.train_sft( + SFT_TRAJECTORIES, + config=art.SFTConfig(learning_rate=2e-6), + ) + print("SFT phase 1 complete.") + + # ======================================================================== + # Phase 2: RL (GRPO) + # ======================================================================== + print("\n[Phase 2] RL training...") + client = model.openai_client() + prompt = "respond with yes, no, or maybe" + + for i in range(5): + print(f" RL step {i + 1}") + train_groups = await art.gather_trajectory_groups( + [ + art.TrajectoryGroup( + rl_rollout(client, model.name, prompt) for _ in range(6) + ) + for _ in range(12) + ] + ) + await model.train(train_groups, config=art.TrainConfig(learning_rate=1e-5)) + print("RL phase 2 complete.") + + # ======================================================================== + # Phase 3: SFT again + # ======================================================================== + print("\n[Phase 3] SFT training again...") + await model.train_sft( + SFT_TRAJECTORIES, + config=art.SFTConfig(batch_size=1, learning_rate=2e-6), + ) + print("SFT phase 3 complete.") + + # ======================================================================== + # Phase 4: RL (GRPO) again + # ======================================================================== + print("\n[Phase 4] RL training...") + client = model.openai_client() + prompt = "respond with yes, no, or maybe" + + for i in range(5): + print(f" RL step {i + 1}") + train_groups = await art.gather_trajectory_groups( + [ + art.TrajectoryGroup( + rl_rollout(client, model.name, prompt) for _ in range(6) + ) + for _ in range(12) + ] + ) + await model.train(train_groups, config=art.TrainConfig(learning_rate=1e-5)) + print("RL phase 4 complete.") + + # ======================================================================== + # Test: Check model output + # ======================================================================== + print("\n[Test] Model output after training:") + completion = await client.chat.completions.create( + messages=[{"role": "user", "content": "respond with yes or no"}], + model=model.name, + max_tokens=10, + ) + print(f"Response: {completion.choices[0].message.content}") + + print("\nAll phases complete!") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml index 232ef220..04fd0bfe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ dependencies = [ "openai>=2.14.0", "typer>=0.15.2", "litellm>=1.74.1", - "weave>=0.52.23", + "weave>=0.52.24", "tinker>=0.7.0", "tinker-cookbook>=0.1.0", "polars>=1.26.0", @@ -32,7 +32,7 @@ backend = [ "accelerate==1.7.0", "awscli>=1.38.1", "setuptools>=78.1.0", - "wandb==0.23.1", + "wandb==0.24.0", "transformers>=4.55.2,<=4.57.3", "duckdb>=1.0.0", "pyarrow>=15.0.0", diff --git a/src/art/__init__.py b/src/art/__init__.py index 75957e17..fef9b1f4 100644 --- a/src/art/__init__.py +++ b/src/art/__init__.py @@ -64,6 +64,7 @@ def __init__(self, **kwargs): Messages, MessagesAndChoices, ServerlessTrainResult, + SFTConfig, Tools, TrainConfig, TrainResult, @@ -89,6 +90,7 @@ def __init__(self, **kwargs): "Model", "TrainableModel", "retry", + "SFTConfig", "TrainConfig", "TrainResult", "TinkerBackend", diff --git a/src/art/backend.py b/src/art/backend.py index c9a8bed8..c32761f5 100644 --- a/src/art/backend.py +++ b/src/art/backend.py @@ -14,8 +14,8 @@ ) from . import dev -from .trajectories import TrajectoryGroup -from .types import TrainConfig, TrainResult +from .trajectories import Trajectory, TrajectoryGroup +from .types import SFTConfig, TrainConfig, TrainResult if TYPE_CHECKING: from .model import Model, TrainableModel @@ -149,6 +149,21 @@ async def _train_model( if pbar is not None: pbar.close() + async def _train_sft( + self, + model: "TrainableModel", + trajectories: Iterable[Trajectory], + config: SFTConfig, + dev_config: dev.SFTConfig, + verbose: bool = False, + ) -> AsyncIterator[dict[str, float]]: + raise NotImplementedError( + "SFT training is not yet implemented. " + "This method will be available in a future release." + ) + # This yield is unreachable but makes this an async generator + yield # type: ignore + # ------------------------------------------------------------------ # Experimental support for S3 # ------------------------------------------------------------------ diff --git a/src/art/dev/__init__.py b/src/art/dev/__init__.py index 45e1cdef..c31fc557 100644 --- a/src/art/dev/__init__.py +++ b/src/art/dev/__init__.py @@ -8,7 +8,7 @@ TrainerArgs, ) from .openai_server import OpenAIServerConfig, ServerArgs, get_openai_server_config -from .train import TrainConfig +from .train import SFTConfig, TrainConfig __all__ = [ "EngineArgs", @@ -21,5 +21,6 @@ "get_openai_server_config", "OpenAIServerConfig", "ServerArgs", + "SFTConfig", "TrainConfig", ] diff --git a/src/art/dev/openai_server.py b/src/art/dev/openai_server.py index 2f28560f..e6f400d1 100644 --- a/src/art/dev/openai_server.py +++ b/src/art/dev/openai_server.py @@ -18,18 +18,19 @@ def get_openai_server_config( config = OpenAIServerConfig() log_file = config.get("log_file", log_file) - # Extract step from lora_path for multi-checkpoint support - # lora_path format is: {output_dir}/checkpoints/{step:04d} - lora_name = model_name + # Build LoRA modules list for multi-checkpoint support + # Register under both model_name (for "current" model) and model_name@step (for specific checkpoint) + lora_modules: list[str] | None = None if lora_path: step = int(os.path.basename(lora_path)) - lora_name = f"{model_name}@{step}" + lora_modules = [ + f'{{"name": "{model_name}", "path": "{lora_path}"}}', + f'{{"name": "{model_name}@{step}", "path": "{lora_path}"}}', + ] server_args = ServerArgs( api_key="default", - lora_modules=( - [f'{{"name": "{lora_name}", "path": "{lora_path}"}}'] if lora_path else None - ), + lora_modules=lora_modules, return_tokens_as_token_ids=True, enable_auto_tool_choice=True, tool_call_parser="hermes", @@ -37,7 +38,7 @@ def get_openai_server_config( server_args.update(config.get("server_args", {})) engine_args = EngineArgs( model=base_model, - served_model_name=base_model if lora_path else model_name, + served_model_name=model_name, generation_config="vllm", ) engine_args.update(config.get("engine_args", {})) diff --git a/src/art/dev/train.py b/src/art/dev/train.py index bd415074..5822d673 100644 --- a/src/art/dev/train.py +++ b/src/art/dev/train.py @@ -27,3 +27,17 @@ class TrainConfig(TypedDict, total=False): scale_learning_rate_by_reward_std_dev: bool scale_rewards: bool truncated_importance_sampling: float | None + + +class SFTConfig(TypedDict, total=False): + """Experimental SFT configuration options. Use at your own risk. + + Undocumented options (may change): + instruction_part: Override auto-detected instruction marker for tokenization. + Used to identify where user turns begin in the chat template. + response_part: Override auto-detected response marker for tokenization. + Used to identify where assistant turns begin (train on responses only). + """ + + instruction_part: str + response_part: str diff --git a/src/art/local/backend.py b/src/art/local/backend.py index 2f9be638..0c9d6f21 100644 --- a/src/art/local/backend.py +++ b/src/art/local/backend.py @@ -10,6 +10,7 @@ import aiohttp import numpy as np from openai import AsyncOpenAI +import polars as pl import torch from tqdm import auto as tqdm from transformers import AutoImageProcessor, AutoTokenizer @@ -41,7 +42,7 @@ ) from ..preprocessing.tokenize import tokenize_trajectory_groups from ..trajectories import Trajectory, TrajectoryGroup -from ..types import LocalTrainResult, Message, TrainConfig +from ..types import LocalTrainResult, Message, SFTConfig, TrainConfig from ..utils import format_message, get_model_step from .checkpoints import ( delete_checkpoints, @@ -349,7 +350,6 @@ async def _monitor_openai_server( # Otherwise, continue and try again # Note: _log() method has been moved to the Model class (frontend) - def _trajectory_log(self, trajectory: Trajectory) -> str: """Format a trajectory into a readable log string.""" header = f"reward: {trajectory.reward} {' '.join(f'{k}: {v}' for k, v in trajectory.metrics.items())}\n\n" @@ -613,8 +613,119 @@ async def _train_model( if verbose: print("_train_model complete") - # Note: _get_reward_std_dev_learning_rate_multiplier and _log_metrics - # have been moved to the Model class (frontend) + async def _train_sft( + self, + model: TrainableModel, + trajectories: Iterable[Trajectory], + config: SFTConfig, + dev_config: dev.SFTConfig, + verbose: bool = False, + ) -> AsyncIterator[dict[str, float]]: + """Train the model using supervised fine-tuning. + + Args: + model: The trainable model to fine-tune + trajectories: Iterable of Trajectory objects + config: SFT configuration with batch_size and learning rates + dev_config: Developer configuration + verbose: Whether to print detailed logs + + Yields: + Dictionary containing training metrics for each batch + """ + if verbose: + print("Starting _train_sft") + + # Convert iterator to list + trajectory_list = list(trajectories) + + if len(trajectory_list) == 0: + if verbose: + print("No trajectories to train on") + return + + # Get tokenizer + if model.base_model not in self._tokenizers: + self._tokenizers[model.base_model] = AutoTokenizer.from_pretrained( + model.base_model + ) + tokenizer = self._tokenizers[model.base_model] + + # Determine batch_size + batch_size = config.batch_size + if batch_size == "auto": + batch_size = 1 # Default to 1 for SFT + + # Determine learning rates + if config.custom_lr_schedule and len(config.custom_lr_schedule) > 0: + # Use custom learning rate schedule + learning_rates = config.custom_lr_schedule + else: + # Use constant learning rate for all batches + num_batches = math.ceil(len(trajectory_list) / batch_size) + learning_rates = [config.learning_rate] * num_batches + + # Tokenize trajectories into batches + from ..preprocessing.tokenize import tokenize_sft_batches + from ..utils.model_config import get_instruction_response_parts + + # Get instruction/response parts (from config or auto-detect) + instruction_part = dev_config.get("instruction_part", None) + response_part = dev_config.get("response_part", None) + + if instruction_part is None or response_part is None: + detected_inst, detected_resp = get_instruction_response_parts( + model.base_model, tokenizer + ) + instruction_part = instruction_part or detected_inst + response_part = response_part or detected_resp + + if verbose: + print(f"Using instruction_part: {instruction_part!r}") + print(f"Using response_part: {response_part!r}") + + sft_batches = list( + tokenize_sft_batches( + trajectories=trajectory_list, + batch_size=batch_size, + learning_rates=learning_rates, + tokenizer=tokenizer, + instruction_part=instruction_part, + response_part=response_part, + ) + ) + + if verbose: + total_trainable = sum(b.num_trainable_tokens for b in sft_batches) + print( + f"Tokenized {len(trajectory_list)} trajectories into {len(sft_batches)} batches" + ) + print(f"Total trainable tokens: {total_trainable}") + if total_trainable == 0: + print( + "WARNING: No trainable tokens found! Check instruction_part and response_part settings." + ) + + # Get the service to access the model and optimizer + service = await self._get_service(model) + + # Train using the service's train_sft method + if verbose: + print("Using service.train_sft") + + num_batches = len(sft_batches) + pbar = tqdm.tqdm(total=num_batches, desc="Processing chunk", leave=False) + + async for result in service.train_sft(sft_batches, verbose): + pbar.update(1) + pbar.set_postfix({"loss": f"{result.get('loss', 0):.4f}"}) + + # Note: Metrics logging is handled by the frontend (Model.train_sft()) + yield result + pbar.close() + + if verbose: + print("_train_sft complete") # ------------------------------------------------------------------ # Experimental support for S3 diff --git a/src/art/local/service.py b/src/art/local/service.py index b54ed570..25cee346 100644 --- a/src/art/local/service.py +++ b/src/art/local/service.py @@ -28,3 +28,9 @@ def train( _config: dev.TrainConfig, verbose: bool = False, ) -> AsyncIterator[dict[str, float]]: ... + + def train_sft( + self, + sft_batches: list, + verbose: bool = False, + ) -> AsyncIterator[dict[str, float]]: ... diff --git a/src/art/model.py b/src/art/model.py index 1afd407e..18d489fd 100644 --- a/src/art/model.py +++ b/src/art/model.py @@ -12,7 +12,7 @@ from . import dev from .trajectories import Trajectory, TrajectoryGroup -from .types import TrainConfig +from .types import SFTConfig, TrainConfig from .utils.old_benchmarking.calculate_step_metrics import calculate_step_std_dev from .utils.trajectory_logging import write_trajectory_groups_parquet @@ -672,3 +672,35 @@ async def train( # Get the current step after training step = await self.get_step() self._log_metrics(avg_metrics, "train", step) + + async def train_sft( + self, + trajectories: Iterable[Trajectory], + config: SFTConfig | None = None, + _config: dev.SFTConfig | None = None, + verbose: bool = False, + ) -> None: + """ + Supervised fine-tune the model with an iterable of trajectories. + + Args: + trajectories: An iterable of Trajectory objects. + config: SFT configuration including learning_rates and batch_size. + If None, uses default SFTConfig(). + _config: Additional experimental configuration that is subject to change and + not yet part of the public API. Use at your own risk. + verbose: Whether to print verbose output. + """ + if config is None: + config = SFTConfig() + + # Get starting step from checkpoint for per-batch logging + step = await self.get_step() + + # Train (backend yields metrics for each batch without logging) + async for metrics in self.backend()._train_sft( + self, trajectories, config, _config or {}, verbose + ): + # Log each batch's metrics with incrementing step + step += 1 + self._log_metrics(metrics, "train", step) diff --git a/src/art/serverless/backend.py b/src/art/serverless/backend.py index 53e427ae..e60ec7d0 100644 --- a/src/art/serverless/backend.py +++ b/src/art/serverless/backend.py @@ -9,8 +9,8 @@ from .. import dev from ..backend import Backend -from ..trajectories import TrajectoryGroup -from ..types import ServerlessTrainResult, TrainConfig +from ..trajectories import Trajectory, TrajectoryGroup +from ..types import ServerlessTrainResult, SFTConfig, TrainConfig if TYPE_CHECKING: from ..model import Model, TrainableModel @@ -302,6 +302,21 @@ async def _train_model( raise RuntimeError(f"Training job failed: {error_message}") after = event.id + async def _train_sft( + self, + model: "TrainableModel", + trajectories: Iterable[Trajectory], + config: SFTConfig, + dev_config: dev.SFTConfig, + verbose: bool = False, + ) -> AsyncIterator[dict[str, float]]: + raise NotImplementedError( + "SFT training is not yet implemented for ServerlessBackend. " + "Please use the Backend HTTP API or implement this method." + ) + # This yield is unreachable but makes this an async generator + yield # type: ignore + # ------------------------------------------------------------------ # Experimental support for S3 and checkpoints # ------------------------------------------------------------------ diff --git a/src/art/tinker/service.py b/src/art/tinker/service.py index 1db653c0..8aa6711b 100644 --- a/src/art/tinker/service.py +++ b/src/art/tinker/service.py @@ -184,6 +184,14 @@ def custom_loss_fn( state.sampler_clients[next_step] = new_sampler_client state.latest_step = next_step + async def train_sft( + self, + sft_batches: list, + verbose: bool = False, + ) -> AsyncIterator[dict[str, float]]: + raise NotImplementedError("SFT training is not supported for TinkerService") + yield {} # Make this a generator + async def delete_checkpoints(self, steps_to_keep: list[int]) -> None: state = await self._state_task # Find steps to delete diff --git a/src/art/types.py b/src/art/types.py index df81d684..bc850911 100644 --- a/src/art/types.py +++ b/src/art/types.py @@ -19,6 +19,12 @@ class TrainConfig(pydantic.BaseModel): beta: float = 0.0 +class SFTConfig(pydantic.BaseModel): + learning_rate: float = 2e-4 + batch_size: int | Literal["auto"] = "auto" + custom_lr_schedule: list[float] = [] + + Verbosity = Literal[0, 1, 2] diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py index 182831ba..80987cc8 100644 --- a/src/art/unsloth/service.py +++ b/src/art/unsloth/service.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from functools import cached_property import os -from typing import TYPE_CHECKING, Any, AsyncIterator, Protocol, cast +from typing import TYPE_CHECKING, Any, AsyncIterator, Literal, Protocol, cast from datasets import Dataset import peft @@ -139,13 +139,15 @@ async def process_train_batch( def save_checkpoint( trainer: "GRPOTrainer", output_dir: str, + step: int | None = None, verbose: bool = False, ) -> str: """Save a checkpoint and return the checkpoint directory path.""" if verbose: print("Saving new LoRA adapter...") - next_step = get_step_from_dir(output_dir) + 1 - checkpoint_dir = get_step_checkpoint_dir(output_dir, next_step) + if step is None: + step = get_step_from_dir(output_dir) + 1 + checkpoint_dir = get_step_checkpoint_dir(output_dir, step) os.makedirs(checkpoint_dir, exist_ok=True) trainer.save_model(checkpoint_dir) return checkpoint_dir @@ -261,6 +263,7 @@ class UnslothService: config: dev.InternalModelConfig output_dir: str _is_sleeping: bool = False + _last_training_mode: Literal["sft", "rl"] | None = None _latest_step: int = 0 _lora_id_counter: int = 1 # Start from 1 since 0 is reserved @@ -301,7 +304,6 @@ async def vllm_engine_is_sleeping(self) -> bool: async def register_lora_for_step(self, step: int, checkpoint_dir: str) -> None: """Register a LoRA adapter for a specific checkpoint step. - This is called when training is skipped but the checkpoint is renamed. """ llm = await self.llm @@ -320,6 +322,43 @@ async def register_lora_for_step(self, step: int, checkpoint_dir: str) -> None: self._latest_step = step await llm.resume_generation() + def _get_optimizer(self) -> torch.optim.AdamW: + """Get or create the shared optimizer. + + GRPOTrainer creates its optimizer lazily, so it may be None if we + start with SFT before any RL training. This method ensures the + optimizer exists and is shared between SFT and RL. + """ + if self._state.trainer.optimizer is None: + self._state.trainer.optimizer = torch.optim.AdamW( + self._state.peft_model.parameters(), + lr=1e-4, + betas=(0.9, 0.999), + weight_decay=0.0, + ) + return self._state.trainer.optimizer + + def _reset_optimizer_if_mode_changed( + self, + mode: Literal["sft", "rl"], + ) -> None: + """Reset optimizer state if training mode changed. + + Uses a single shared optimizer (trainer.optimizer) for both SFT and RL. + Resets optimizer state (momentum, variance) only when switching between + training modes to avoid stale state from a different loss landscape. + """ + mode_changed = ( + self._last_training_mode is not None and self._last_training_mode != mode + ) + + if mode_changed: + optimizer = self._get_optimizer() + # Clear all optimizer state (exp_avg, exp_avg_sq, step for each param) + optimizer.state.clear() + + self._last_training_mode = mode + async def train( self, disk_packed_tensors: DiskPackedTensors, @@ -351,6 +390,9 @@ async def train( # Reload training model to GPU (after vLLM is asleep) self._state.reload_to_gpu() + # Reset optimizer state if switching from SFT to RL + self._reset_optimizer_if_mode_changed("rl") + # Load packed tensors packed_tensors = packed_tensors_from_dir(**disk_packed_tensors) @@ -429,6 +471,169 @@ async def train( if verbose: print("UnslothService.train complete") + async def train_sft( + self, + sft_batches: list, + verbose: bool = False, + ) -> AsyncIterator[dict[str, float]]: + """Train the model using supervised fine-tuning. + + Args: + sft_batches: List of SFTBatch objects from tokenize_sft_batches + verbose: Whether to print detailed logs + + Yields: + Dictionary containing training metrics for each batch + """ + llm = await self.llm + + # Pause generation to prevent new requests during training + await llm.pause_generation() + + # Determine sleep level based on outstanding requests + has_unfinished = llm.output_processor.has_unfinished_requests() + if has_unfinished: + sleep_level = 1 + else: + await llm.reset_prefix_cache() + sleep_level = 2 + + # Put workers to sleep + await run_on_workers(llm, do_sleep, level=sleep_level) + self._is_sleeping = True + gc_and_empty_cuda_cache() + + # Reload training model to GPU (after vLLM is asleep) + self._state.reload_to_gpu() + + # Get model + peft_model = self._state.peft_model + + # Reset optimizer state if switching from RL to SFT + # Uses shared trainer.optimizer for both SFT and RL + self._reset_optimizer_if_mode_changed("sft") + optimizer = self._get_optimizer() + + # Reset environment variable that may be set by RL training + os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "0" + + peft_model.train() + optimizer.zero_grad() + device = next(peft_model.parameters()).device + max_grad_norm = 1.0 + + if verbose: + print(f"Training SFT on {len(sft_batches)} batches") + + import time + + # The training loop + for batch_idx, batch in enumerate(sft_batches): + batch_start_time = time.perf_counter() + batch_loss = 0.0 + + # Update learning rate for this batch + for param_group in optimizer.param_groups: + param_group["lr"] = batch.learning_rate + + # Total trainable tokens for loss normalization (normalizes by tokens, not trajectories) + # This ensures each token contributes equally regardless of how trajectories are split + num_items_in_batch = torch.tensor( + batch.num_trainable_tokens, dtype=torch.long, device=device + ) + + # Process each trajectory in the batch (gradient accumulation) + for trajectory_tensor in batch.trajectory_tensors: + # Move tensors to device + input_ids = trajectory_tensor["input_ids"].to(device) + attention_mask = trajectory_tensor["attention_mask"].to(device) + labels = trajectory_tensor["labels"].to(device) + + # Forward pass with num_items_in_batch for proper loss normalization + # Loss = sum(cross_entropy) / num_items_in_batch + outputs = peft_model( + input_ids=input_ids, + attention_mask=attention_mask, + labels=labels, + num_items_in_batch=num_items_in_batch, + ) + + loss = outputs.loss + + # Backward pass - accumulate gradients + loss.backward() + + # Track metrics + batch_loss += loss.item() + + # Gradient clipping + grad_norm = torch.nn.utils.clip_grad_norm_( + peft_model.parameters(), max_grad_norm + ).item() + + # Optimizer step at the end of each batch + optimizer.step() + optimizer.zero_grad() + + # Compute timing metrics + batch_time = time.perf_counter() - batch_start_time + tokens_per_second = ( + batch.num_trainable_tokens / batch_time if batch_time > 0 else 0.0 + ) + + if verbose: + print( + f"Batch {batch_idx}: loss={batch_loss:.4f}, lr={batch.learning_rate:.2e}, " + f"grad_norm={grad_norm:.4f}, tok/s={tokens_per_second:.1f}" + ) + + # Yield metrics + yield { + "loss": batch_loss, + "learning_rate": batch.learning_rate, + "grad_norm": grad_norm, + "num_trajectories": float(batch.num_trajectories), + "num_trainable_tokens": float(batch.num_trainable_tokens), + "tokens_per_second": tokens_per_second, + } + + # Save checkpoint after training + # Name checkpoint by final training step: starting_step + num_batches + final_step = get_step_from_dir(self.output_dir) + len(sft_batches) + checkpoint_dir = save_checkpoint( + trainer=self._state.trainer, + output_dir=self.output_dir, + step=final_step, + verbose=verbose, + ) + + # Offload training model to CPU before waking vLLM + self._state.offload_to_cpu() + + # Free memory before waking up vLLM + gc_and_empty_cuda_cache() + await asyncio.sleep(0.5) + + # Wake up workers + await run_on_workers(llm, do_wake_up) + self._is_sleeping = False + + # Swap out the LoRA adapter with the newly trained checkpoint + await llm.remove_lora(1) + await llm.add_lora( + LoRARequest( + lora_name=self.model_name, + lora_int_id=1, + lora_path=checkpoint_dir, + ) + ) + + # Resume generation after LoRA swap is complete + await llm.resume_generation() + + if verbose: + print("UnslothService.train_sft complete") + @cached_property def _state(self) -> UnslothState: import unsloth diff --git a/src/art/utils/model_config.py b/src/art/utils/model_config.py new file mode 100644 index 00000000..7c9f93a6 --- /dev/null +++ b/src/art/utils/model_config.py @@ -0,0 +1,194 @@ +"""Model-specific configuration for chat templates and training defaults.""" + +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class SFTDefaults: + """Default SFT training parameters for a model.""" + + batch_size: int + learning_rate: float + + +@dataclass +class ModelConfig: + """Configuration for a specific model's chat template.""" + + instruction_part: str + response_part: str + sft_defaults: Optional[SFTDefaults] = None + + +# Model identifier -> configuration mapping +# These define the chat template markers used for "train on responses only" +MODEL_CONFIGS: dict[str, ModelConfig] = { + # Qwen 2.5 models (ChatML format) + "Qwen/Qwen2.5-0.5B-Instruct": ModelConfig( + instruction_part="<|im_start|>user\n", + response_part="<|im_start|>assistant\n", + ), + "Qwen/Qwen2.5-1.5B-Instruct": ModelConfig( + instruction_part="<|im_start|>user\n", + response_part="<|im_start|>assistant\n", + ), + "Qwen/Qwen2.5-3B-Instruct": ModelConfig( + instruction_part="<|im_start|>user\n", + response_part="<|im_start|>assistant\n", + ), + "Qwen/Qwen2.5-7B-Instruct": ModelConfig( + instruction_part="<|im_start|>user\n", + response_part="<|im_start|>assistant\n", + ), + "Qwen/Qwen2.5-14B-Instruct": ModelConfig( + instruction_part="<|im_start|>user\n", + response_part="<|im_start|>assistant\n", + sft_defaults=SFTDefaults(batch_size=2, learning_rate=1e-4), + ), + "Qwen/Qwen2.5-32B-Instruct": ModelConfig( + instruction_part="<|im_start|>user\n", + response_part="<|im_start|>assistant\n", + ), + "Qwen/Qwen2.5-72B-Instruct": ModelConfig( + instruction_part="<|im_start|>user\n", + response_part="<|im_start|>assistant\n", + ), + # Qwen 3 models (with thinking tokens) + "Qwen/Qwen3-8B": ModelConfig( + instruction_part="<|im_start|>user\n", + response_part="<|im_start|>assistant\n\n\n\n\n", + ), + "Qwen/Qwen3-14B": ModelConfig( + instruction_part="<|im_start|>user\n", + response_part="<|im_start|>assistant\n\n\n\n\n", + ), + "Qwen/Qwen3-32B": ModelConfig( + instruction_part="<|im_start|>user\n", + response_part="<|im_start|>assistant\n\n\n\n\n", + ), + "OpenPipe/Qwen3-14B-Instruct": ModelConfig( + instruction_part="<|im_start|>user\n", + response_part="<|im_start|>assistant\n\n\n\n\n", + sft_defaults=SFTDefaults(batch_size=2, learning_rate=1e-4), + ), + "Qwen/Qwen3-30B-A3B-Instruct-2507": ModelConfig( + instruction_part="<|im_start|>user\n", + response_part="<|im_start|>assistant\n", + sft_defaults=SFTDefaults(batch_size=2, learning_rate=1e-4), + ), + # Llama 3 models + "meta-llama/Llama-3.1-8B-Instruct": ModelConfig( + instruction_part="<|start_header_id|>user<|end_header_id|>\n\n", + response_part="<|start_header_id|>assistant<|end_header_id|>\n\n", + ), + "meta-llama/Llama-3.1-70B-Instruct": ModelConfig( + instruction_part="<|start_header_id|>user<|end_header_id|>\n\n", + response_part="<|start_header_id|>assistant<|end_header_id|>\n\n", + ), + "meta-llama/Llama-3.2-1B-Instruct": ModelConfig( + instruction_part="<|start_header_id|>user<|end_header_id|>\n\n", + response_part="<|start_header_id|>assistant<|end_header_id|>\n\n", + ), + "meta-llama/Llama-3.2-3B-Instruct": ModelConfig( + instruction_part="<|start_header_id|>user<|end_header_id|>\n\n", + response_part="<|start_header_id|>assistant<|end_header_id|>\n\n", + ), + # Gemma models + "google/gemma-2-2b-it": ModelConfig( + instruction_part="user\n", + response_part="model\n", + ), + "google/gemma-2-9b-it": ModelConfig( + instruction_part="user\n", + response_part="model\n", + ), + "google/gemma-2-27b-it": ModelConfig( + instruction_part="user\n", + response_part="model\n", + ), +} + + +def get_model_config(model_id: str) -> Optional[ModelConfig]: + """Get the configuration for a given model. + + Args: + model_id: The model identifier (e.g., "Qwen/Qwen2.5-7B-Instruct") + + Returns: + ModelConfig if found, None otherwise + """ + return MODEL_CONFIGS.get(model_id) + + +def detect_chat_template_parts( + tokenizer_or_template: object, +) -> tuple[str, str]: + """Detect instruction and response parts from a chat template string. + + This is a fallback when the model is not in MODEL_CONFIGS. + + Args: + tokenizer_or_template: Either a tokenizer with chat_template attr, + or the chat template string directly + + Returns: + Tuple of (instruction_part, response_part) + """ + if hasattr(tokenizer_or_template, "chat_template"): + template: str = getattr(tokenizer_or_template, "chat_template", "") or "" + elif isinstance(tokenizer_or_template, str): + template = tokenizer_or_template + else: + template = "" + + # ChatML format (Qwen, etc.) + if "<|im_start|>" in template: + return "<|im_start|>user\n", "<|im_start|>assistant\n" + + # Llama 3 format + if "<|start_header_id|>" in template: + return ( + "<|start_header_id|>user<|end_header_id|>\n\n", + "<|start_header_id|>assistant<|end_header_id|>\n\n", + ) + + # Gemma format + if "" in template: + return "user\n", "model\n" + + # Mistral format + if "[INST]" in template: + return "[INST]", "[/INST]" + + # Default fallback to ChatML (most common) + return "<|im_start|>user\n", "<|im_start|>assistant\n" + + +def get_instruction_response_parts( + model_id: str, + tokenizer: Optional[object] = None, +) -> tuple[str, str]: + """Get instruction and response parts for a model. + + First checks MODEL_CONFIGS, then falls back to template detection. + + Args: + model_id: The model identifier + tokenizer: Optional tokenizer for fallback detection + + Returns: + Tuple of (instruction_part, response_part) + """ + # Check explicit config first + config = get_model_config(model_id) + if config is not None: + return config.instruction_part, config.response_part + + # Fallback to detection + if tokenizer is not None: + return detect_chat_template_parts(tokenizer) + + # Ultimate fallback + return detect_chat_template_parts("") diff --git a/src/art/utils/sft.py b/src/art/utils/sft.py new file mode 100644 index 00000000..f672ec1d --- /dev/null +++ b/src/art/utils/sft.py @@ -0,0 +1,491 @@ +"""Utilities for supervised fine-tuning (SFT).""" + +from dataclasses import dataclass +import json +import math +import random +from typing import TYPE_CHECKING, Generator, List, Literal + +from tqdm.auto import tqdm + +if TYPE_CHECKING: + from art.dev import SFTConfig as DevSFTConfig + from art.model import TrainableModel + from art.trajectories import Trajectory + from art.types import SFTConfig + + +@dataclass +class SFTDatasetChunk: + """Container for SFT dataset chunk with trajectories, config, and step information.""" + + trajectories: List["Trajectory"] + config: "SFTConfig" + step: int + epoch: int + epoch_step: int + + +def _parse_jsonl_line(line: str) -> "Trajectory": + """Parse a JSONL line into a Trajectory object. + + Args: + line: A JSON string containing trajectory data with 'messages' and optional 'tools'. + + Returns: + A Trajectory object with the parsed data. + """ + from art.trajectories import Trajectory + + data = json.loads(line) + return Trajectory( + messages_and_choices=data.get("messages", []), + tools=data.get("tools"), + reward=0.0, + ) + + +def get_file_row_count(file_path: str) -> int: + """ + Count the number of non-empty rows in a JSONL file. + + Args: + file_path: Path to JSONL file + + Returns: + Number of non-empty lines in the file + + Raises: + ValueError: If file_path does not end with .jsonl + + Example: + count = get_file_row_count("data.jsonl") + print(f"Dataset has {count} items") + """ + if not file_path.endswith(".jsonl"): + raise ValueError(f"Only JSONL files are supported. Got: {file_path}") + + count = 0 + with open(file_path, "r") as f: + for line in f: + if line.strip(): + count += 1 + return count + + +def create_lr_schedule( + total_steps: int, + peak_lr: float, + method: Literal["cosine", "linear", "constant"] = "linear", + warmup_steps: int = 0, + min_lr: float = 0.0, +) -> List[float]: + """ + Create learning rate schedule for training with optional warmup. + + Args: + total_steps: Total number of training steps + peak_lr: Peak learning rate + method: Learning rate schedule method. Options: + - "cosine": Cosine annealing from peak_lr to min_lr + - "linear": Linear decay from peak_lr to min_lr + - "constant": Constant learning rate (peak_lr for all steps) + warmup_steps: Number of warmup steps (linear warmup from 0 to peak_lr) + min_lr: Minimum learning rate (floor for decay schedules) + + Returns: + List of learning rates for each step + + Example: + # Cosine schedule with warmup + lrs = create_lr_schedule(100, 1e-4, method="cosine", warmup_steps=10) + + # Use with training loop + for step, chunk in enumerate(chunk_trajectories(...)): + train_sft(chunk, learning_rate=lrs[step]) + """ + if total_steps <= 0: + return [] + + learning_rates = [] + decay_steps = total_steps - warmup_steps + + for step in range(total_steps): + if step < warmup_steps: + # Warmup: linear ramp from min_lr to peak_lr + # Use (step + 1) so first step has lr > 0 + lr = min_lr + (peak_lr - min_lr) * ((step + 1) / warmup_steps) + else: + # Decay phase: progress goes from 0 to 1 + progress = ( + (step - warmup_steps) / (decay_steps - 1) if decay_steps > 1 else 0 + ) + if method == "cosine": + lr = min_lr + (peak_lr - min_lr) * 0.5 * ( + 1 + math.cos(math.pi * progress) + ) + elif method == "linear": + lr = peak_lr - (peak_lr - min_lr) * progress + elif method == "constant": + lr = peak_lr + else: + raise ValueError( + f"Unknown method: {method}. Choose from: cosine, linear, constant" + ) + + learning_rates.append(lr) + + return learning_rates + + +def create_sft_dataset_iterator( + trajectories: List["Trajectory"], + epochs: int = 1, + batch_size: int = 1, + chunk_size: int = 50, + peak_lr: float = 2e-4, + schedule_type: Literal["cosine", "linear", "constant"] = "linear", + warmup_ratio: float = 0.1, + initial_step: int = 0, + use_tqdm: bool = True, +) -> Generator[SFTDatasetChunk, None, None]: + """ + Create an iterator that yields SFT dataset chunks with trajectories, config, and step info. + + Combines trajectory batching with learning rate scheduling. Yields SFTDatasetChunk objects + containing flattened trajectories, SFTConfig with learning rates, and step tracking info. + + Args: + trajectories: List of Trajectory objects to train on + epochs: Number of times to iterate over the trajectories. Default: 1 + batch_size: Number of trajectories per batch (one weight update per batch). Default: 1 + chunk_size: Number of batches to process per train_sft call. Default: 50. + This is an internal optimization parameter and does not affect training. + peak_lr: Peak learning rate. Default: 2e-4 + schedule_type: Learning rate schedule type ("cosine", "linear", "constant"). Default: "constant" + warmup_ratio: Ratio of total steps to use for warmup (0.0 to 1.0). Default: 0.1 + initial_step: The global training step (batch) to start from. Default: 0. + Useful for resuming training. + use_tqdm: Whether to display a progress bar. Default: True + + Yields: + SFTDatasetChunk containing: + - trajectories: Flattened list of trajectories for this chunk + - config: SFTConfig with custom_lr_schedule containing learning rates for each batch + - step: Global training step (batch number) at the start of this chunk + - epoch: Current epoch number (0-indexed) + - epoch_step: Training step within current epoch (0-indexed) + + Example: + trajectories = [traj1, traj2, ..., traj100] + + # Create SFT dataset iterator with constant schedule (default) + for chunk in create_sft_dataset_iterator( + trajectories=trajectories, + epochs=3, + batch_size=4, + chunk_size=10, + peak_lr=1e-4, + ): + # chunk.trajectories is a flat list of up to 40 trajectories + # chunk.config.custom_lr_schedule is a list of learning rates (one per batch) + # chunk.config.batch_size is 4 + # chunk.step is global training step (weight update number) + # chunk.epoch is current epoch + # chunk.epoch_step is training step within epoch + await model.train_sft(chunk.trajectories, chunk.config) + + # Resume from training step 50 + for chunk in create_sft_dataset_iterator( + trajectories=trajectories, + epochs=3, + batch_size=4, + chunk_size=10, + initial_step=50, + ): + # Starts from training step 50 + pass + """ + from art.types import SFTConfig + + if chunk_size < 1: + raise ValueError(f"chunk_size must be >= 1, got {chunk_size}") + + dataset_size = len(trajectories) + if dataset_size == 0: + return + + # Calculate total batch steps (one step per batch) + batches_per_epoch = math.ceil(dataset_size / batch_size) + total_batch_steps = batches_per_epoch * epochs + + # Calculate warmup steps + warmup_steps = int(total_batch_steps * warmup_ratio) + + # Create learning rate schedule (one LR per batch) + custom_lr_schedule = create_lr_schedule( + total_steps=total_batch_steps, + peak_lr=peak_lr, + method=schedule_type, + warmup_steps=warmup_steps, + min_lr=0.0, + ) + + # Calculate chunk iteration parameters + items_per_chunk = batch_size * chunk_size + chunks_per_epoch = math.ceil(dataset_size / items_per_chunk) + + # Convert initial_step (batch-based) to initial_chunk for skipping + initial_chunk = initial_step // chunk_size + + progress_bar = None + if use_tqdm: + progress_bar = tqdm( + initial=initial_step, + total=total_batch_steps, + desc="Training SFT", + unit="step", + ) + + for epoch in range(epochs): + # Create indices and shuffle deterministically based on epoch + indices = list(range(dataset_size)) + rng = random.Random(epoch) + rng.shuffle(indices) + + for chunk_idx in range(chunks_per_epoch): + # Calculate global chunk index for skipping + global_chunk_idx = epoch * chunks_per_epoch + chunk_idx + + # Skip if before initial_chunk + if global_chunk_idx < initial_chunk: + continue + + # Get indices for this chunk + chunk_start = chunk_idx * items_per_chunk + chunk_end = min(chunk_start + items_per_chunk, dataset_size) + step_indices = indices[chunk_start:chunk_end] + + # Flatten trajectories for this chunk + chunk_trajectories: List["Trajectory"] = [ + trajectories[idx] for idx in step_indices + ] + + # Calculate learning rates for each batch in this chunk + chunk_lrs: List[float] = [] + num_batches_in_chunk = math.ceil(len(step_indices) / batch_size) + + # Calculate global batch step at the start of this chunk + global_batch_step = epoch * batches_per_epoch + (chunk_start // batch_size) + + for batch_idx in range(num_batches_in_chunk): + chunk_lrs.append(custom_lr_schedule[global_batch_step + batch_idx]) + + # Create SFTConfig with custom learning rate schedule + config = SFTConfig( + batch_size=batch_size, + custom_lr_schedule=chunk_lrs, + ) + + # epoch_step is the batch step within the current epoch + epoch_batch_step = chunk_start // batch_size + + yield SFTDatasetChunk( + trajectories=chunk_trajectories, + config=config, + step=global_batch_step, + epoch=epoch, + epoch_step=epoch_batch_step, + ) + + # Update progress bar by the number of batches in this chunk + if progress_bar: + progress_bar.update(num_batches_in_chunk) + + if progress_bar: + progress_bar.close() + + +def iterate_file( + file_path: str, + epochs: int, + shuffle: bool = True, + shuffle_buffer_size: int = 10000, + seed: int | None = 42, +) -> Generator["Trajectory", None, None]: + """ + Read JSONL file for each epoch, yielding individual Trajectory objects. + + Completes reading the entire file for one epoch before starting the next epoch. + This ensures all trajectories from epoch N are yielded before any from epoch N+1. + + Each line should contain a dict with: + - messages: List of chat messages + - tools: Optional list of tools + - reward: Optional reward (defaults to 0.0) + - split: Optional split name (stored in metadata) + - Any other fields will be stored in metadata + + Args: + file_path: Path to JSONL file (one JSON object per line) + epochs: Number of times to read through the file + shuffle: Whether to shuffle trajectories. Defaults to True. + shuffle_buffer_size: Size of shuffle buffer for streaming shuffle. Default: 10000. + Only used if shuffle=True. + seed: Random seed for deterministic shuffling. Default: 42. + Only used if shuffle=True. + + Yields: + Individual Trajectory objects + + Raises: + ValueError: If file_path does not end with .jsonl + + Example: + # With shuffle + for trajectory in iterate_file("data.jsonl", epochs=3, shuffle=True): + # trajectory is a single Trajectory object + process(trajectory) + + # No shuffle + for trajectory in iterate_file("data.jsonl", epochs=3, shuffle=False): + process(trajectory) + """ + if not file_path.endswith(".jsonl"): + raise ValueError(f"Only JSONL files are supported. Got: {file_path}") + + for epoch in range(epochs): + # Use local Random instance to avoid modifying global random state + if seed is not None: + rng = random.Random(seed + epoch) + else: + rng = random.Random() + + if shuffle: + # Streaming shuffle with buffer + shuffle_buffer: List["Trajectory"] = [] + + with open(file_path, "r") as f: + for line in f: + if not line.strip(): + continue + + traj = _parse_jsonl_line(line) + shuffle_buffer.append(traj) + + # Once buffer is full, start yielding randomly + if len(shuffle_buffer) >= shuffle_buffer_size: + idx = rng.randint(0, len(shuffle_buffer) - 1) + yield shuffle_buffer.pop(idx) + + # Flush remaining items in shuffle buffer at end of epoch + rng.shuffle(shuffle_buffer) + for traj in shuffle_buffer: + yield traj + else: + # No shuffle - sequential reading + with open(file_path, "r") as f: + for line in f: + if not line.strip(): + continue + + yield _parse_jsonl_line(line) + + +async def train_sft_from_file( + model: "TrainableModel", + file_path: str, + epochs: int = 1, + batch_size: int = 1, + chunk_size: int = 50, + peak_lr: float = 2e-4, + schedule_type: Literal["cosine", "linear", "constant"] = "linear", + warmup_ratio: float = 0.1, + initial_step: int = 0, + use_tqdm: bool = True, + _config: "DevSFTConfig | None" = None, + verbose: bool = False, +) -> None: + """ + Train a model using supervised fine-tuning from a JSONL file. + + This is a convenience function that combines iterate_file() and + create_sft_dataset_iterator() to provide a simple interface for SFT training. + + Args: + model: The TrainableModel to fine-tune. Must be registered with a backend. + file_path: Path to JSONL file containing training data. Each line should have: + - messages: List of chat messages + - tools: Optional list of tools + epochs: Number of times to iterate over the dataset. Default: 1 + batch_size: Number of trajectories per batch (one weight update per batch). Default: 1 + chunk_size: Number of batches to process per train_sft call. Default: 50. + This is an internal optimization parameter and does not affect training. + peak_lr: Peak learning rate. Default: 2e-4 + schedule_type: Learning rate schedule type ("cosine", "linear", "constant"). Default: "linear" + warmup_ratio: Ratio of total steps to use for warmup (0.0 to 1.0). Default: 0.1 + initial_step: The global training step (batch) to start from. Default: 0. + Useful for resuming training. + use_tqdm: Whether to display a progress bar. Default: True + _config: Additional experimental configuration. Use at your own risk. + verbose: Whether to print verbose output. Default: False + + Example: + import art + from art.local import LocalBackend + from art.utils.sft import train_sft_from_file + + async def main(): + backend = LocalBackend() + model = art.TrainableModel( + name="my-model", + project="my-project", + base_model="Qwen/Qwen2.5-7B-Instruct", + ) + await model.register(backend) + + # Train with linear decay schedule + await train_sft_from_file( + model=model, + file_path="data/train.jsonl", + epochs=3, + batch_size=4, + peak_lr=2e-4, + schedule_type="linear", + ) + + # Train with cosine schedule and warmup + await train_sft_from_file( + model=model, + file_path="data/train.jsonl", + epochs=1, + batch_size=2, + peak_lr=1e-4, + schedule_type="cosine", + warmup_ratio=0.1, + ) + """ + # Load all trajectories into memory (needed for shuffling across epochs) + trajectories = list(iterate_file(file_path, epochs=1, shuffle=False)) + + if verbose: + print(f"Loaded {len(trajectories)} trajectories from {file_path}") + + # Create dataset iterator and train + for chunk in create_sft_dataset_iterator( + trajectories=trajectories, + epochs=epochs, + batch_size=batch_size, + chunk_size=chunk_size, + peak_lr=peak_lr, + schedule_type=schedule_type, + warmup_ratio=warmup_ratio, + initial_step=initial_step, + use_tqdm=use_tqdm, + ): + await model.train_sft( + chunk.trajectories, + chunk.config, + _config=_config, + verbose=verbose, + ) diff --git a/tests/unit/test_sft.py b/tests/unit/test_sft.py new file mode 100644 index 00000000..a523f960 --- /dev/null +++ b/tests/unit/test_sft.py @@ -0,0 +1,274 @@ +"""Unit tests for SFT utilities.""" + +import json +from pathlib import Path +import tempfile + +import pytest + +from art.trajectories import Trajectory +from art.utils.sft import create_lr_schedule, create_sft_dataset_iterator, iterate_file + + +# Helper to create dummy trajectories +def create_dummy_trajectory(idx: int) -> Trajectory: + """Create a dummy trajectory with a unique identifier.""" + return Trajectory( + messages_and_choices=[ + {"role": "user", "content": f"Message {idx}"}, + {"role": "assistant", "content": f"Response {idx}"}, + ], + reward=float(idx), + ) + + +# Helper to create a temporary JSONL file +def create_temp_jsonl(num_trajectories: int) -> Path: + """Create a temporary JSONL file with dummy trajectories.""" + temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) + for i in range(num_trajectories): + data = { + "messages": [ + {"role": "user", "content": f"Message {i}"}, + {"role": "assistant", "content": f"Response {i}"}, + ], + } + temp_file.write(json.dumps(data) + "\n") + temp_file.close() + return Path(temp_file.name) + + +# ============================================================================ +# Integration tests +# ============================================================================ + + +def test_create_sft_dataset_iterator(): + """Test create_sft_dataset_iterator yields correct chunks.""" + trajectories = [create_dummy_trajectory(i) for i in range(20)] + + # batch_size=8, chunk_size=2 means each chunk has up to 2 batches of 8 trajectories + # With 20 trajectories per epoch: + # - Items per chunk: 8 * 2 = 16 + # - Chunks per epoch: ceil(20/16) = 2 (one with 16 trajs, one with 4 trajs) + # With 3 epochs: 2 * 3 = 6 chunks total + + chunks = list( + create_sft_dataset_iterator( + trajectories, + epochs=3, + batch_size=8, # 8 trajectories per batch + chunk_size=2, # 2 batches per chunk + use_tqdm=False, + ) + ) + + # Should have 6 chunks total (2 per epoch * 3 epochs) + assert len(chunks) == 6 + + # Pattern repeats for each epoch: full chunk (16 trajs), partial chunk (4 trajs) + assert len(chunks[0].trajectories) == 16 # Epoch 1, chunk 1 + assert len(chunks[1].trajectories) == 4 # Epoch 1, chunk 2 (partial) + assert len(chunks[2].trajectories) == 16 # Epoch 2, chunk 1 + assert len(chunks[3].trajectories) == 4 # Epoch 2, chunk 2 (partial) + assert len(chunks[4].trajectories) == 16 # Epoch 3, chunk 1 + assert len(chunks[5].trajectories) == 4 # Epoch 3, chunk 2 (partial) + + # Verify chunk metadata + # Chunk 0: starts at batch 0 (16 trajectories = 2 batches) + assert chunks[0].step == 0 + assert chunks[0].epoch == 0 + assert chunks[0].epoch_step == 0 + + # Chunk 1: starts at batch 2 (after 2 batches from chunk 0) + # chunk_start=16, global_batch_step = ceil(16/8) = 2 + assert chunks[1].step == 2 + assert chunks[1].epoch == 0 + assert chunks[1].epoch_step == 2 + + +def test_iterate_file(): + """Test iterate_file reads trajectories correctly.""" + jsonl_file = create_temp_jsonl(10) + + try: + # Read without shuffle + trajectories = list( + iterate_file( + str(jsonl_file), + epochs=2, + shuffle=False, + ) + ) + + # Should have 20 trajectories (10 per epoch * 2 epochs) + assert len(trajectories) == 20 + + # Verify the content - first epoch should be in order + for i in range(10): + assert f"Message {i}" in str(trajectories[i].messages_and_choices) + + finally: + jsonl_file.unlink() + + +def test_iterate_file_with_shuffle(): + """Test iterate_file with shuffle enabled.""" + jsonl_file = create_temp_jsonl(100) + + try: + # Read with shuffle + trajectories = list( + iterate_file( + str(jsonl_file), + epochs=2, + shuffle=True, + shuffle_buffer_size=10, + ) + ) + + # Should have 200 trajectories + assert len(trajectories) == 200 + + finally: + jsonl_file.unlink() + + +def test_chunk_size_validation(): + """Test that chunk_size < 1 raises an error.""" + trajectories = [create_dummy_trajectory(i) for i in range(10)] + + with pytest.raises(ValueError, match="chunk_size must be >= 1"): + list(create_sft_dataset_iterator(trajectories, chunk_size=0, use_tqdm=False)) + + with pytest.raises(ValueError, match="chunk_size must be >= 1"): + list(create_sft_dataset_iterator(trajectories, chunk_size=-1, use_tqdm=False)) + + +def test_lr_schedule_warmup_not_zero(): + """Test that warmup doesn't start at lr=0.""" + lrs = create_lr_schedule( + total_steps=10, + peak_lr=1e-4, + method="constant", + warmup_steps=5, + min_lr=0.0, + ) + + # First step should NOT be 0 + assert lrs[0] > 0 + # Should reach peak_lr by end of warmup + assert lrs[4] == pytest.approx(1e-4) + # After warmup, should stay at peak_lr (constant schedule) + assert lrs[5] == pytest.approx(1e-4) + + +def test_lr_schedule_edge_cases(): + """Test LR schedule edge cases.""" + # Empty schedule + lrs = create_lr_schedule(total_steps=0, peak_lr=1e-4) + assert lrs == [] + + # Single step + lrs = create_lr_schedule(total_steps=1, peak_lr=1e-4) + assert len(lrs) == 1 + assert lrs[0] == pytest.approx(1e-4) + + # Warmup steps >= total_steps (edge case) + lrs = create_lr_schedule(total_steps=5, peak_lr=1e-4, warmup_steps=10) + assert len(lrs) == 5 + # Should not crash and should produce valid learning rates + assert all(lr > 0 for lr in lrs) + + +def test_lr_schedule_decay_methods(): + """Test that cosine and linear decay work correctly.""" + peak_lr = 1e-4 + min_lr = 1e-5 + + # Linear decay: should go from peak_lr to min_lr + lrs = create_lr_schedule( + total_steps=5, peak_lr=peak_lr, method="linear", min_lr=min_lr + ) + assert lrs[0] == pytest.approx(peak_lr) # Start at peak + assert lrs[-1] == pytest.approx(min_lr) # End at min + # Should be monotonically decreasing + for i in range(len(lrs) - 1): + assert lrs[i] >= lrs[i + 1] + + # Cosine decay: should go from peak_lr to min_lr + lrs = create_lr_schedule( + total_steps=5, peak_lr=peak_lr, method="cosine", min_lr=min_lr + ) + assert lrs[0] == pytest.approx(peak_lr) # Start at peak + assert lrs[-1] == pytest.approx(min_lr) # End at min + + +def test_lr_schedule_no_warmup(): + """Test schedule with warmup_steps=0.""" + lrs = create_lr_schedule( + total_steps=5, peak_lr=1e-4, method="linear", warmup_steps=0, min_lr=0 + ) + assert len(lrs) == 5 + assert lrs[0] == pytest.approx(1e-4) # Start at peak (no warmup) + assert lrs[-1] == pytest.approx(0) # End at min_lr + + +def test_create_sft_dataset_iterator_with_initial_step(): + """Test resuming from initial_step skips correct number of batches.""" + trajectories = [create_dummy_trajectory(i) for i in range(20)] + + # Without initial_step: should get all chunks + all_chunks = list( + create_sft_dataset_iterator( + trajectories, epochs=1, batch_size=4, chunk_size=2, use_tqdm=False + ) + ) + + # With initial_step=2: should skip first 2 batches (first chunk) + resumed_chunks = list( + create_sft_dataset_iterator( + trajectories, + epochs=1, + batch_size=4, + chunk_size=2, + initial_step=2, + use_tqdm=False, + ) + ) + + # Should have fewer chunks when resuming + assert len(resumed_chunks) < len(all_chunks) + # First resumed chunk should start at step 2 or later + assert resumed_chunks[0].step >= 2 + + +def test_create_sft_dataset_iterator_epoch_shuffling(): + """Test that different epochs have different trajectory orderings.""" + trajectories = [create_dummy_trajectory(i) for i in range(10)] + + chunks = list( + create_sft_dataset_iterator( + trajectories, + epochs=2, + batch_size=10, # One batch per epoch + chunk_size=1, + use_tqdm=False, + ) + ) + + # Should have 2 chunks (one per epoch) + assert len(chunks) == 2 + + # Different epochs should have different orderings (due to shuffle) + epoch0_contents = [ + t.messages_and_choices[0]["content"] for t in chunks[0].trajectories + ] + epoch1_contents = [ + t.messages_and_choices[0]["content"] for t in chunks[1].trajectories + ] + assert epoch0_contents != epoch1_contents + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/uv.lock b/uv.lock index b66b314e..366a3519 100644 --- a/uv.lock +++ b/uv.lock @@ -5156,8 +5156,8 @@ requires-dist = [ { name = "unsloth", marker = "extra == 'backend'", specifier = "==2025.12.9" }, { name = "unsloth-zoo", marker = "extra == 'backend'", specifier = "==2025.12.7" }, { name = "vllm", marker = "extra == 'backend'", specifier = "==0.13.0" }, - { name = "wandb", marker = "extra == 'backend'", specifier = "==0.23.1" }, - { name = "weave", specifier = ">=0.52.23" }, + { name = "wandb", marker = "extra == 'backend'", specifier = "==0.24.0" }, + { name = "weave", specifier = ">=0.52.24" }, ] provides-extras = ["plotting", "backend", "skypilot", "langgraph"] @@ -9127,7 +9127,7 @@ wheels = [ [[package]] name = "wandb" -version = "0.23.1" +version = "0.24.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, @@ -9141,17 +9141,17 @@ dependencies = [ { name = "sentry-sdk" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/0a/cc/770ae3aa7ae44f6792f7ecb81c14c0e38b672deb35235719bb1006519487/wandb-0.23.1.tar.gz", hash = "sha256:f6fb1e3717949b29675a69359de0eeb01e67d3360d581947d5b3f98c273567d6", size = 44298053, upload-time = "2025-12-03T02:25:10.79Z" } +sdist = { url = "https://files.pythonhosted.org/packages/27/7e/aad6e943012ea4d88f3a037f1a5a7c6898263c60fbef8c9cdb95a8ff9fd9/wandb-0.24.0.tar.gz", hash = "sha256:4715a243b3d460b6434b9562e935dfd9dfdf5d6e428cfb4c3e7ce4fd44460ab3", size = 44197947, upload-time = "2026-01-13T22:59:59.767Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/12/0b/c3d7053dfd93fd259a63c7818d9c4ac2ba0642ff8dc8db98662ea0cf9cc0/wandb-0.23.1-py3-none-macosx_12_0_arm64.whl", hash = "sha256:358e15471d19b7d73fc464e37371c19d44d39e433252ac24df107aff993a286b", size = 21527293, upload-time = "2025-12-03T02:24:48.011Z" }, - { url = "https://files.pythonhosted.org/packages/ee/9f/059420fa0cb6c511dc5c5a50184122b6aca7b178cb2aa210139e354020da/wandb-0.23.1-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:110304407f4b38f163bdd50ed5c5225365e4df3092f13089c30171a75257b575", size = 22745926, upload-time = "2025-12-03T02:24:50.519Z" }, - { url = "https://files.pythonhosted.org/packages/96/b6/fd465827c14c64d056d30b4c9fcf4dac889a6969dba64489a88fc4ffa333/wandb-0.23.1-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:6cc984cf85feb2f8ee0451d76bc9fb7f39da94956bb8183e30d26284cf203b65", size = 21212973, upload-time = "2025-12-03T02:24:52.828Z" }, - { url = "https://files.pythonhosted.org/packages/5c/ee/9a8bb9a39cc1f09c3060456cc79565110226dc4099a719af5c63432da21d/wandb-0.23.1-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:67431cd3168d79fdb803e503bd669c577872ffd5dadfa86de733b3274b93088e", size = 22887885, upload-time = "2025-12-03T02:24:55.281Z" }, - { url = "https://files.pythonhosted.org/packages/6d/4d/8d9e75add529142e037b05819cb3ab1005679272950128d69d218b7e5b2e/wandb-0.23.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:07be70c0baa97ea25fadc4a9d0097f7371eef6dcacc5ceb525c82491a31e9244", size = 21250967, upload-time = "2025-12-03T02:24:57.603Z" }, - { url = "https://files.pythonhosted.org/packages/97/72/0b35cddc4e4168f03c759b96d9f671ad18aec8bdfdd84adfea7ecb3f5701/wandb-0.23.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:216c95b08e0a2ec6a6008373b056d597573d565e30b43a7a93c35a171485ee26", size = 22988382, upload-time = "2025-12-03T02:25:00.518Z" }, - { url = "https://files.pythonhosted.org/packages/c0/6d/e78093d49d68afb26f5261a70fc7877c34c114af5c2ee0ab3b1af85f5e76/wandb-0.23.1-py3-none-win32.whl", hash = "sha256:fb5cf0f85692f758a5c36ab65fea96a1284126de64e836610f92ddbb26df5ded", size = 22150756, upload-time = "2025-12-03T02:25:02.734Z" }, - { url = "https://files.pythonhosted.org/packages/05/27/4f13454b44c9eceaac3d6e4e4efa2230b6712d613ff9bf7df010eef4fd18/wandb-0.23.1-py3-none-win_amd64.whl", hash = "sha256:21c8c56e436eb707b7d54f705652e030d48e5cfcba24cf953823eb652e30e714", size = 22150760, upload-time = "2025-12-03T02:25:05.106Z" }, - { url = "https://files.pythonhosted.org/packages/30/20/6c091d451e2a07689bfbfaeb7592d488011420e721de170884fedd68c644/wandb-0.23.1-py3-none-win_arm64.whl", hash = "sha256:8aee7f3bb573f2c0acf860f497ca9c684f9b35f2ca51011ba65af3d4592b77c1", size = 20137463, upload-time = "2025-12-03T02:25:08.317Z" }, + { url = "https://files.pythonhosted.org/packages/5f/8a/efec186dcc5dcf3c806040e3f33e58997878b2d30b87aa02b26f046858b6/wandb-0.24.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:aa9777398ff4b0f04c41359f7d1b95b5d656cb12c37c63903666799212e50299", size = 21464901, upload-time = "2026-01-13T22:59:31.86Z" }, + { url = "https://files.pythonhosted.org/packages/ed/84/fadf0d5f1d86c3ba662d2b33a15d2b1f08ff1e4e196c77e455f028b0fda2/wandb-0.24.0-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:0423fbd58c3926949724feae8aab89d20c68846f9f4f596b80f9ffe1fc298130", size = 22697817, upload-time = "2026-01-13T22:59:35.267Z" }, + { url = "https://files.pythonhosted.org/packages/6e/5f/e3124e68d02b30c62856175ce714e07904730be06eecb00f66bb1a59aacf/wandb-0.24.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:2b25fc0c123daac97ed32912ac55642c65013cc6e3a898e88ca2d917fc8eadc0", size = 21118798, upload-time = "2026-01-13T22:59:38.453Z" }, + { url = "https://files.pythonhosted.org/packages/22/a1/8d68a914c030e897c306c876d47c73aa5d9ca72be608971290d3a5749570/wandb-0.24.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:9485344b4667944b5b77294185bae8469cfa4074869bec0e74f54f8492234cc2", size = 22849954, upload-time = "2026-01-13T22:59:41.265Z" }, + { url = "https://files.pythonhosted.org/packages/e9/f8/3e68841a4282a4fb6a8935534e6064acc6c9708e8fb76953ec73bbc72a5e/wandb-0.24.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:51b2b9a9d7d6b35640f12a46a48814fd4516807ad44f586b819ed6560f8de1fd", size = 21160339, upload-time = "2026-01-13T22:59:43.967Z" }, + { url = "https://files.pythonhosted.org/packages/16/e5/d851868ce5b4b437a7cc90405979cd83809790e4e2a2f1e454f63f116e52/wandb-0.24.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:11f7e7841f31eff82c82a677988889ad3aa684c6de61ff82145333b5214ec860", size = 22936978, upload-time = "2026-01-13T22:59:46.911Z" }, + { url = "https://files.pythonhosted.org/packages/d2/34/43b7f18870051047ce6fe18e7eb24ba7ebdc71663a8f1c58e31e855eb8ac/wandb-0.24.0-py3-none-win32.whl", hash = "sha256:42af348998b00d4309ae790c5374040ac6cc353ab21567f4e29c98c9376dee8e", size = 22118243, upload-time = "2026-01-13T22:59:49.555Z" }, + { url = "https://files.pythonhosted.org/packages/a1/92/909c81173cf1399111f57f9ca5399a8f165607b024e406e080178c878f70/wandb-0.24.0-py3-none-win_amd64.whl", hash = "sha256:32604eddcd362e1ed4a2e2ce5f3a239369c4a193af223f3e66603481ac91f336", size = 22118246, upload-time = "2026-01-13T22:59:52.126Z" }, + { url = "https://files.pythonhosted.org/packages/87/85/a845aefd9c2285f98261fa6ffa0a14466366c1ac106d35bc84b654c0ad7f/wandb-0.24.0-py3-none-win_arm64.whl", hash = "sha256:e0f2367552abfca21b0f3a03405fbf48f1e14de9846e70f73c6af5da57afd8ef", size = 20077678, upload-time = "2026-01-13T22:59:56.112Z" }, ] [[package]] @@ -9291,7 +9291,7 @@ wheels = [ [[package]] name = "weave" -version = "0.52.23" +version = "0.52.24" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, @@ -9306,9 +9306,9 @@ dependencies = [ { name = "tzdata", marker = "sys_platform == 'win32'" }, { name = "wandb" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a2/3c/119b64e092218f7c37f9ca65bfda8ef856be6c03687e5701fa67fd16f3df/weave-0.52.23.tar.gz", hash = "sha256:ad4f37cc901cb93a000faedbe1313509e97c2e7a18cbaada574fef820741ff32", size = 647645, upload-time = "2026-01-08T18:23:46.71Z" } +sdist = { url = "https://files.pythonhosted.org/packages/37/a9/479a4ebd918bf8d08ea1d8daf1a40ace4c6940f86472700036436c6acc78/weave-0.52.24.tar.gz", hash = "sha256:1d756c4b35054d8efba0300eb2a595f9c63d96601f1a3f37918e620742c13481", size = 650113, upload-time = "2026-01-15T20:46:41.771Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/06/489587dd6f5d06f19535ffbf6cd98e061eef3b50583d8dc40d79271c470b/weave-0.52.23-py3-none-any.whl", hash = "sha256:debe9eac5bdec857dc2507401f075b1b1f3199eccc9ef29f106747c13b23966c", size = 810639, upload-time = "2026-01-08T18:23:44.593Z" }, + { url = "https://files.pythonhosted.org/packages/c4/b1/b900fb5e81bbb143ef69ea49b2a2024357f6423511b24d14ab65dbe80272/weave-0.52.24-py3-none-any.whl", hash = "sha256:afe79bb5326aaf22751f2bae20878354062794c3577ab7a28c67512b784d8992", size = 813312, upload-time = "2026-01-15T20:46:39.725Z" }, ] [[package]]