Skip to content

Commit

Permalink
📝 Style Transfer: update dataset information to reflect GPT4-o token replacement

Browse files Browse the repository at this point in the history
  • Loading branch information
simonmeoni committed Nov 4, 2024
1 parent 627f2c5 commit 8e41417
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 14 deletions.
Original file line number Diff line number Diff line change
@@ -1,21 +1,32 @@
import json
import os
import time
from pathlib import Path

from openai import OpenAI
import time

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def replace_tokens_with_gpt4(text):
"""Replace anonymized tokens with fake but realistic data using GPT-4."""
try:
response = client.chat.completions.create(
model="gpt-4",
model="gpt-4o",
messages=[
{"role": "system", "content": "You are a helpful assistant that replaces anonymized medical tokens with realistic but fake data. Maintain medical context and consistency."},
{"role": "user", "content": f"Replace anonymized tokens in this text with realistic but fake medical data. Preserve the medical context: {text}"}
{
"role": "system",
"content": "You are a helpful assistant that replaces anonymized "
"medical tokens with realistic but fake data. "
"Maintain medical context and consistency.",
},
{
"role": "user",
"content": f"Replace anonymized tokens in this text with realistic but "
f"fake medical data. Preserve the medical context: {text}",
},
],
temperature=0.7
temperature=0.7,
)
return response.choices[0].message.content.strip()
except Exception as e:
Expand All @@ -39,7 +50,7 @@ def filter_entries_by_keyword_count(input_file, output_file, min_keywords=50, ma
# Add rate limiting to avoid API throttling
time.sleep(1)
filtered_entries.append(entry)

if filtered_entries:
filtered_data[key] = filtered_entries

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This is the huggingface dataset for yelp review style transfer.
"""This is the huggingface dataset for MIMIC-III clinical notes with GPT4O token replacements.
"""

import json
Expand All @@ -20,11 +20,12 @@
import datasets

_DESCRIPTION = """\
This dataset is a collection of clinical cases from mimic-iii that have been preprocessed
to be used for style transfer.
This dataset is a collection of clinical cases from MIMIC-III where sensitive tokens have been
identified and replaced using GPT4o to maintain privacy while preserving clinical meaning.
The dataset is preprocessed for style transfer tasks.
"""

_DATASET_NAME = "mimic_iii_dataset"
_DATASET_NAME = "mimic_iii_gpt4o_tokens"

_HOMEPAGE = "https://github.com/arkhn/ai-lembic"

Expand Down Expand Up @@ -74,16 +75,18 @@
"""


class MimicIiiDataset(datasets.GeneratorBasedBuilder):
"""This is the huggingface dataset for mimic III style transfer."""
class MimicIiiGPT4ODataset(datasets.GeneratorBasedBuilder):
"""This is the huggingface dataset for MIMIC-III with GPT4O token replacements for style
transfer."""

VERSION = datasets.Version("0.1.0")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name=_DATASET_NAME,
version=VERSION,
description="This is a collection of clinical cases from mimic iii "
"that have been preprocessed to be used for style transfer.",
description="This is a collection of clinical cases from MIMIC-III "
"with GPT4O token replacements for privacy preservation, "
"preprocessed for style transfer tasks.",
),
]

Expand Down

0 comments on commit 8e41417

Please sign in to comment.