Skip to content

Commit

Permalink
📝 Style Transfer: update dataset information to reflect GPT4-o token replacement

Browse files Browse the repository at this point in the history
  • Loading branch information
simonmeoni committed Nov 4, 2024
1 parent 627f2c5 commit 8e41417
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 14 deletions.
Original file line number Diff line number Diff line change
@@ -1,21 +1,32 @@
import json
import os
import time
from pathlib import Path

from openai import OpenAI
import time

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def replace_tokens_with_gpt4(text):
"""Replace anonymized tokens with fake but realistic data using GPT-4."""
try:
response = client.chat.completions.create(
model="gpt-4",
model="gpt-4o",
messages=[
{"role": "system", "content": "You are a helpful assistant that replaces anonymized medical tokens with realistic but fake data. Maintain medical context and consistency."},
{"role": "user", "content": f"Replace anonymized tokens in this text with realistic but fake medical data. Preserve the medical context: {text}"}
{
"role": "system",
"content": "You are a helpful assistant that replaces anonymized "
"medical tokens with realistic but fake data. "
"Maintain medical context and consistency.",
},
{
"role": "user",
"content": f"Replace anonymized tokens in this text with realistic but "
f"fake medical data. Preserve the medical context: {text}",
},
],
temperature=0.7
temperature=0.7,
)
return response.choices[0].message.content.strip()
except Exception as e:
Expand All @@ -39,7 +50,7 @@ def filter_entries_by_keyword_count(input_file, output_file, min_keywords=50, ma
# Add rate limiting to avoid API throttling
time.sleep(1)
filtered_entries.append(entry)

if filtered_entries:
filtered_data[key] = filtered_entries

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This is the huggingface dataset for yelp review style transfer.
"""This is the huggingface dataset for MIMIC-III clinical notes with GPT4O token replacements.
"""

import json
Expand All @@ -20,11 +20,12 @@
import datasets

_DESCRIPTION = """\
This dataset is a collection of clinical cases from mimic-iii that have been preprocessed
to be used for style transfer.
This dataset is a collection of clinical cases from MIMIC-III where sensitive tokens have been
identified and replaced using GPT4o to maintain privacy while preserving clinical meaning.
The dataset is preprocessed for style transfer tasks.
"""

_DATASET_NAME = "mimic_iii_dataset"
_DATASET_NAME = "mimic_iii_gpt4o_tokens"

_HOMEPAGE = "https://github.com/arkhn/ai-lembic"

Expand Down Expand Up @@ -74,16 +75,18 @@
"""


class MimicIiiDataset(datasets.GeneratorBasedBuilder):
"""This is the huggingface dataset for mimic III style transfer."""
class MimicIiiGPT4ODataset(datasets.GeneratorBasedBuilder):
"""This is the huggingface dataset for MIMIC-III with GPT4O token replacements for style
transfer."""

VERSION = datasets.Version("0.1.0")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name=_DATASET_NAME,
version=VERSION,
description="This is a collection of clinical cases from mimic iii "
"that have been preprocessed to be used for style transfer.",
description="This is a collection of clinical cases from MIMIC-III "
"with GPT4O token replacements for privacy preservation, "
"preprocessed for style transfer tasks.",
),
]

Expand Down

0 comments on commit 8e41417

Please sign in to comment.